diff --git a/go.mod b/go.mod
index 1c30ec092..4ccdfecdf 100644
--- a/go.mod
+++ b/go.mod
@@ -84,6 +84,7 @@ require (
github.com/ipfs/go-log/v2 v2.5.1
github.com/jellydator/ttlcache/v3 v3.2.0
github.com/jmoiron/sqlx v1.3.5
+ github.com/klauspost/reedsolomon v1.12.1
github.com/ladydascalie/currency v1.6.0
github.com/meirf/gopart v0.0.0-20180520194036-37e9492a85a8
github.com/mutecomm/go-sqlcipher/v4 v4.4.2
@@ -177,7 +178,7 @@ require (
github.com/jackpal/go-nat-pmp v1.0.2 // indirect
github.com/jbenet/go-temp-err-catcher v0.1.0 // indirect
github.com/klauspost/compress v1.16.7 // indirect
- github.com/klauspost/cpuid/v2 v2.2.5 // indirect
+ github.com/klauspost/cpuid/v2 v2.2.7 // indirect
github.com/koron/go-ssdp v0.0.4 // indirect
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect
github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect
@@ -279,7 +280,7 @@ require (
go.uber.org/fx v1.20.0 // indirect
golang.org/x/mod v0.12.0 // indirect
golang.org/x/sync v0.3.0 // indirect
- golang.org/x/sys v0.11.0 // indirect
+ golang.org/x/sys v0.18.0 // indirect
golang.org/x/term v0.11.0 // indirect
golang.org/x/tools v0.12.1-0.20230818130535-1517d1a3ba60 // indirect
golang.org/x/xerrors v0.0.0-20220609144429-65e65417b02f // indirect
diff --git a/go.sum b/go.sum
index 466cb1c7a..dde981db2 100644
--- a/go.sum
+++ b/go.sum
@@ -1295,10 +1295,12 @@ github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQs
github.com/klauspost/cpuid v0.0.0-20170728055534-ae7887de9fa5/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
github.com/klauspost/cpuid/v2 v2.0.4/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.0.6/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
-github.com/klauspost/cpuid/v2 v2.2.5 h1:0E5MSMDEoAulmXNFquVs//DdoomxaoTY1kUhbc/qbZg=
-github.com/klauspost/cpuid/v2 v2.2.5/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
+github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM=
+github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
github.com/klauspost/crc32 v0.0.0-20161016154125-cb6bfca970f6/go.mod h1:+ZoRqAPRLkC4NPOvfYeR5KNOrY6TD+/sAC3HXPZgDYg=
github.com/klauspost/pgzip v1.0.2-0.20170402124221-0bf5dcad4ada/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
+github.com/klauspost/reedsolomon v1.12.1 h1:NhWgum1efX1x58daOBGCFWcxtEhOhXKKl1HAPQUp03Q=
+github.com/klauspost/reedsolomon v1.12.1/go.mod h1:nEi5Kjb6QqtbofI6s+cbG/j1da11c96IBYBSnVGtuBs=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
@@ -2637,8 +2639,8 @@ golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM=
-golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4=
+golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
diff --git a/vendor/github.com/klauspost/cpuid/v2/README.md b/vendor/github.com/klauspost/cpuid/v2/README.md
index accd7abaf..30f8d2963 100644
--- a/vendor/github.com/klauspost/cpuid/v2/README.md
+++ b/vendor/github.com/klauspost/cpuid/v2/README.md
@@ -9,10 +9,7 @@ You can access the CPU information by accessing the shared CPU variable of the c
Package home: https://github.com/klauspost/cpuid
[![PkgGoDev](https://pkg.go.dev/badge/github.com/klauspost/cpuid)](https://pkg.go.dev/github.com/klauspost/cpuid/v2)
-[![Build Status][3]][4]
-
-[3]: https://travis-ci.org/klauspost/cpuid.svg?branch=master
-[4]: https://travis-ci.org/klauspost/cpuid
+[![Go](https://github.com/klauspost/cpuid/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/cpuid/actions/workflows/go.yml)
## installing
@@ -285,7 +282,12 @@ Exit Code 1
| AMXINT8 | Tile computational operations on 8-bit integers |
| AMXFP16 | Tile computational operations on FP16 numbers |
| AMXTILE | Tile architecture |
+| APX_F | Intel APX |
| AVX | AVX functions |
+| AVX10 | If set the Intel AVX10 Converged Vector ISA is supported |
+| AVX10_128 | If set indicates that AVX10 128-bit vector support is present |
+| AVX10_256 | If set indicates that AVX10 256-bit vector support is present |
+| AVX10_512 | If set indicates that AVX10 512-bit vector support is present |
| AVX2 | AVX2 functions |
| AVX512BF16 | AVX-512 BFLOAT16 Instructions |
| AVX512BITALG | AVX-512 Bit Algorithms |
@@ -365,6 +367,8 @@ Exit Code 1
| IDPRED_CTRL | IPRED_DIS |
| INT_WBINVD | WBINVD/WBNOINVD are interruptible. |
| INVLPGB | NVLPGB and TLBSYNC instruction supported |
+| KEYLOCKER | Key locker |
+| KEYLOCKERW | Key locker wide |
| LAHF | LAHF/SAHF in long mode |
| LAM | If set, CPU supports Linear Address Masking |
| LBRVIRT | LBR virtualization |
@@ -380,7 +384,7 @@ Exit Code 1
| MOVDIRI | Move Doubleword as Direct Store |
| MOVSB_ZL | Fast Zero-Length MOVSB |
| MPX | Intel MPX (Memory Protection Extensions) |
-| MOVU | MOVU SSE instructions are more efficient and should be preferred to SSE MOVL/MOVH. MOVUPS is more efficient than MOVLPS/MOVHPS. MOVUPD is more efficient than MOVLPD/MOVHPD |
+| MOVU | MOVU SSE instructions are more efficient and should be preferred to SSE MOVL/MOVH. MOVUPS is more efficient than MOVLPS/MOVHPS. MOVUPD is more efficient than MOVLPD/MOVHPD |
| MSRIRC | Instruction Retired Counter MSR available |
| MSRLIST | Read/Write List of Model Specific Registers |
| MSR_PAGEFLUSH | Page Flush MSR available |
diff --git a/vendor/github.com/klauspost/cpuid/v2/cpuid.go b/vendor/github.com/klauspost/cpuid/v2/cpuid.go
index d015c744e..805f5e7b4 100644
--- a/vendor/github.com/klauspost/cpuid/v2/cpuid.go
+++ b/vendor/github.com/klauspost/cpuid/v2/cpuid.go
@@ -67,188 +67,200 @@ const (
// Keep index -1 as unknown
UNKNOWN = -1
- // Add features
- ADX FeatureID = iota // Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
- AESNI // Advanced Encryption Standard New Instructions
- AMD3DNOW // AMD 3DNOW
- AMD3DNOWEXT // AMD 3DNowExt
- AMXBF16 // Tile computational operations on BFLOAT16 numbers
- AMXFP16 // Tile computational operations on FP16 numbers
- AMXINT8 // Tile computational operations on 8-bit integers
- AMXTILE // Tile architecture
- AVX // AVX functions
- AVX2 // AVX2 functions
- AVX512BF16 // AVX-512 BFLOAT16 Instructions
- AVX512BITALG // AVX-512 Bit Algorithms
- AVX512BW // AVX-512 Byte and Word Instructions
- AVX512CD // AVX-512 Conflict Detection Instructions
- AVX512DQ // AVX-512 Doubleword and Quadword Instructions
- AVX512ER // AVX-512 Exponential and Reciprocal Instructions
- AVX512F // AVX-512 Foundation
- AVX512FP16 // AVX-512 FP16 Instructions
- AVX512IFMA // AVX-512 Integer Fused Multiply-Add Instructions
- AVX512PF // AVX-512 Prefetch Instructions
- AVX512VBMI // AVX-512 Vector Bit Manipulation Instructions
- AVX512VBMI2 // AVX-512 Vector Bit Manipulation Instructions, Version 2
- AVX512VL // AVX-512 Vector Length Extensions
- AVX512VNNI // AVX-512 Vector Neural Network Instructions
- AVX512VP2INTERSECT // AVX-512 Intersect for D/Q
- AVX512VPOPCNTDQ // AVX-512 Vector Population Count Doubleword and Quadword
- AVXIFMA // AVX-IFMA instructions
- AVXNECONVERT // AVX-NE-CONVERT instructions
- AVXSLOW // Indicates the CPU performs 2 128 bit operations instead of one
- AVXVNNI // AVX (VEX encoded) VNNI neural network instructions
- AVXVNNIINT8 // AVX-VNNI-INT8 instructions
- BHI_CTRL // Branch History Injection and Intra-mode Branch Target Injection / CVE-2022-0001, CVE-2022-0002 / INTEL-SA-00598
- BMI1 // Bit Manipulation Instruction Set 1
- BMI2 // Bit Manipulation Instruction Set 2
- CETIBT // Intel CET Indirect Branch Tracking
- CETSS // Intel CET Shadow Stack
- CLDEMOTE // Cache Line Demote
- CLMUL // Carry-less Multiplication
- CLZERO // CLZERO instruction supported
- CMOV // i686 CMOV
- CMPCCXADD // CMPCCXADD instructions
- CMPSB_SCADBS_SHORT // Fast short CMPSB and SCASB
- CMPXCHG8 // CMPXCHG8 instruction
- CPBOOST // Core Performance Boost
- CPPC // AMD: Collaborative Processor Performance Control
- CX16 // CMPXCHG16B Instruction
- EFER_LMSLE_UNS // AMD: =Core::X86::Msr::EFER[LMSLE] is not supported, and MBZ
- ENQCMD // Enqueue Command
- ERMS // Enhanced REP MOVSB/STOSB
- F16C // Half-precision floating-point conversion
- FLUSH_L1D // Flush L1D cache
- FMA3 // Intel FMA 3. Does not imply AVX.
- FMA4 // Bulldozer FMA4 functions
- FP128 // AMD: When set, the internal FP/SIMD execution datapath is no more than 128-bits wide
- FP256 // AMD: When set, the internal FP/SIMD execution datapath is no more than 256-bits wide
- FSRM // Fast Short Rep Mov
- FXSR // FXSAVE, FXRESTOR instructions, CR4 bit 9
- FXSROPT // FXSAVE/FXRSTOR optimizations
- GFNI // Galois Field New Instructions. May require other features (AVX, AVX512VL,AVX512F) based on usage.
- HLE // Hardware Lock Elision
- HRESET // If set CPU supports history reset and the IA32_HRESET_ENABLE MSR
- HTT // Hyperthreading (enabled)
- HWA // Hardware assert supported. Indicates support for MSRC001_10
- HYBRID_CPU // This part has CPUs of more than one type.
- HYPERVISOR // This bit has been reserved by Intel & AMD for use by hypervisors
- IA32_ARCH_CAP // IA32_ARCH_CAPABILITIES MSR (Intel)
- IA32_CORE_CAP // IA32_CORE_CAPABILITIES MSR
- IBPB // Indirect Branch Restricted Speculation (IBRS) and Indirect Branch Predictor Barrier (IBPB)
- IBRS // AMD: Indirect Branch Restricted Speculation
- IBRS_PREFERRED // AMD: IBRS is preferred over software solution
- IBRS_PROVIDES_SMP // AMD: IBRS provides Same Mode Protection
- IBS // Instruction Based Sampling (AMD)
- IBSBRNTRGT // Instruction Based Sampling Feature (AMD)
- IBSFETCHSAM // Instruction Based Sampling Feature (AMD)
- IBSFFV // Instruction Based Sampling Feature (AMD)
- IBSOPCNT // Instruction Based Sampling Feature (AMD)
- IBSOPCNTEXT // Instruction Based Sampling Feature (AMD)
- IBSOPSAM // Instruction Based Sampling Feature (AMD)
- IBSRDWROPCNT // Instruction Based Sampling Feature (AMD)
- IBSRIPINVALIDCHK // Instruction Based Sampling Feature (AMD)
- IBS_FETCH_CTLX // AMD: IBS fetch control extended MSR supported
- IBS_OPDATA4 // AMD: IBS op data 4 MSR supported
- IBS_OPFUSE // AMD: Indicates support for IbsOpFuse
- IBS_PREVENTHOST // Disallowing IBS use by the host supported
- IBS_ZEN4 // AMD: Fetch and Op IBS support IBS extensions added with Zen4
- IDPRED_CTRL // IPRED_DIS
- INT_WBINVD // WBINVD/WBNOINVD are interruptible.
- INVLPGB // NVLPGB and TLBSYNC instruction supported
- LAHF // LAHF/SAHF in long mode
- LAM // If set, CPU supports Linear Address Masking
- LBRVIRT // LBR virtualization
- LZCNT // LZCNT instruction
- MCAOVERFLOW // MCA overflow recovery support.
- MCDT_NO // Processor do not exhibit MXCSR Configuration Dependent Timing behavior and do not need to mitigate it.
- MCOMMIT // MCOMMIT instruction supported
- MD_CLEAR // VERW clears CPU buffers
- MMX // standard MMX
- MMXEXT // SSE integer functions or AMD MMX ext
- MOVBE // MOVBE instruction (big-endian)
- MOVDIR64B // Move 64 Bytes as Direct Store
- MOVDIRI // Move Doubleword as Direct Store
- MOVSB_ZL // Fast Zero-Length MOVSB
- MOVU // AMD: MOVU SSE instructions are more efficient and should be preferred to SSE MOVL/MOVH. MOVUPS is more efficient than MOVLPS/MOVHPS. MOVUPD is more efficient than MOVLPD/MOVHPD
- MPX // Intel MPX (Memory Protection Extensions)
- MSRIRC // Instruction Retired Counter MSR available
- MSRLIST // Read/Write List of Model Specific Registers
- MSR_PAGEFLUSH // Page Flush MSR available
- NRIPS // Indicates support for NRIP save on VMEXIT
- NX // NX (No-Execute) bit
- OSXSAVE // XSAVE enabled by OS
- PCONFIG // PCONFIG for Intel Multi-Key Total Memory Encryption
- POPCNT // POPCNT instruction
- PPIN // AMD: Protected Processor Inventory Number support. Indicates that Protected Processor Inventory Number (PPIN) capability can be enabled
- PREFETCHI // PREFETCHIT0/1 instructions
- PSFD // Predictive Store Forward Disable
- RDPRU // RDPRU instruction supported
- RDRAND // RDRAND instruction is available
- RDSEED // RDSEED instruction is available
- RDTSCP // RDTSCP Instruction
- RRSBA_CTRL // Restricted RSB Alternate
- RTM // Restricted Transactional Memory
- RTM_ALWAYS_ABORT // Indicates that the loaded microcode is forcing RTM abort.
- SERIALIZE // Serialize Instruction Execution
- SEV // AMD Secure Encrypted Virtualization supported
- SEV_64BIT // AMD SEV guest execution only allowed from a 64-bit host
- SEV_ALTERNATIVE // AMD SEV Alternate Injection supported
- SEV_DEBUGSWAP // Full debug state swap supported for SEV-ES guests
- SEV_ES // AMD SEV Encrypted State supported
- SEV_RESTRICTED // AMD SEV Restricted Injection supported
- SEV_SNP // AMD SEV Secure Nested Paging supported
- SGX // Software Guard Extensions
- SGXLC // Software Guard Extensions Launch Control
- SHA // Intel SHA Extensions
- SME // AMD Secure Memory Encryption supported
- SME_COHERENT // AMD Hardware cache coherency across encryption domains enforced
- SPEC_CTRL_SSBD // Speculative Store Bypass Disable
- SRBDS_CTRL // SRBDS mitigation MSR available
- SSE // SSE functions
- SSE2 // P4 SSE functions
- SSE3 // Prescott SSE3 functions
- SSE4 // Penryn SSE4.1 functions
- SSE42 // Nehalem SSE4.2 functions
- SSE4A // AMD Barcelona microarchitecture SSE4a instructions
- SSSE3 // Conroe SSSE3 functions
- STIBP // Single Thread Indirect Branch Predictors
- STIBP_ALWAYSON // AMD: Single Thread Indirect Branch Prediction Mode has Enhanced Performance and may be left Always On
- STOSB_SHORT // Fast short STOSB
- SUCCOR // Software uncorrectable error containment and recovery capability.
- SVM // AMD Secure Virtual Machine
- SVMDA // Indicates support for the SVM decode assists.
- SVMFBASID // SVM, Indicates that TLB flush events, including CR3 writes and CR4.PGE toggles, flush only the current ASID's TLB entries. Also indicates support for the extended VMCBTLB_Control
- SVML // AMD SVM lock. Indicates support for SVM-Lock.
- SVMNP // AMD SVM nested paging
- SVMPF // SVM pause intercept filter. Indicates support for the pause intercept filter
- SVMPFT // SVM PAUSE filter threshold. Indicates support for the PAUSE filter cycle count threshold
- SYSCALL // System-Call Extension (SCE): SYSCALL and SYSRET instructions.
- SYSEE // SYSENTER and SYSEXIT instructions
- TBM // AMD Trailing Bit Manipulation
- TDX_GUEST // Intel Trust Domain Extensions Guest
- TLB_FLUSH_NESTED // AMD: Flushing includes all the nested translations for guest translations
- TME // Intel Total Memory Encryption. The following MSRs are supported: IA32_TME_CAPABILITY, IA32_TME_ACTIVATE, IA32_TME_EXCLUDE_MASK, and IA32_TME_EXCLUDE_BASE.
- TOPEXT // TopologyExtensions: topology extensions support. Indicates support for CPUID Fn8000_001D_EAX_x[N:0]-CPUID Fn8000_001E_EDX.
- TSCRATEMSR // MSR based TSC rate control. Indicates support for MSR TSC ratio MSRC000_0104
- TSXLDTRK // Intel TSX Suspend Load Address Tracking
- VAES // Vector AES. AVX(512) versions requires additional checks.
- VMCBCLEAN // VMCB clean bits. Indicates support for VMCB clean bits.
- VMPL // AMD VM Permission Levels supported
- VMSA_REGPROT // AMD VMSA Register Protection supported
- VMX // Virtual Machine Extensions
- VPCLMULQDQ // Carry-Less Multiplication Quadword. Requires AVX for 3 register versions.
- VTE // AMD Virtual Transparent Encryption supported
- WAITPKG // TPAUSE, UMONITOR, UMWAIT
- WBNOINVD // Write Back and Do Not Invalidate Cache
- WRMSRNS // Non-Serializing Write to Model Specific Register
- X87 // FPU
- XGETBV1 // Supports XGETBV with ECX = 1
- XOP // Bulldozer XOP functions
- XSAVE // XSAVE, XRESTOR, XSETBV, XGETBV
- XSAVEC // Supports XSAVEC and the compacted form of XRSTOR.
- XSAVEOPT // XSAVEOPT available
- XSAVES // Supports XSAVES/XRSTORS and IA32_XSS
+ // x86 features
+ ADX FeatureID = iota // Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
+ AESNI // Advanced Encryption Standard New Instructions
+ AMD3DNOW // AMD 3DNOW
+ AMD3DNOWEXT // AMD 3DNowExt
+ AMXBF16 // Tile computational operations on BFLOAT16 numbers
+ AMXFP16 // Tile computational operations on FP16 numbers
+ AMXINT8 // Tile computational operations on 8-bit integers
+ AMXTILE // Tile architecture
+ APX_F // Intel APX
+ AVX // AVX functions
+ AVX10 // If set the Intel AVX10 Converged Vector ISA is supported
+ AVX10_128 // If set indicates that AVX10 128-bit vector support is present
+ AVX10_256 // If set indicates that AVX10 256-bit vector support is present
+ AVX10_512 // If set indicates that AVX10 512-bit vector support is present
+ AVX2 // AVX2 functions
+ AVX512BF16 // AVX-512 BFLOAT16 Instructions
+ AVX512BITALG // AVX-512 Bit Algorithms
+ AVX512BW // AVX-512 Byte and Word Instructions
+ AVX512CD // AVX-512 Conflict Detection Instructions
+ AVX512DQ // AVX-512 Doubleword and Quadword Instructions
+ AVX512ER // AVX-512 Exponential and Reciprocal Instructions
+ AVX512F // AVX-512 Foundation
+ AVX512FP16 // AVX-512 FP16 Instructions
+ AVX512IFMA // AVX-512 Integer Fused Multiply-Add Instructions
+ AVX512PF // AVX-512 Prefetch Instructions
+ AVX512VBMI // AVX-512 Vector Bit Manipulation Instructions
+ AVX512VBMI2 // AVX-512 Vector Bit Manipulation Instructions, Version 2
+ AVX512VL // AVX-512 Vector Length Extensions
+ AVX512VNNI // AVX-512 Vector Neural Network Instructions
+ AVX512VP2INTERSECT // AVX-512 Intersect for D/Q
+ AVX512VPOPCNTDQ // AVX-512 Vector Population Count Doubleword and Quadword
+ AVXIFMA // AVX-IFMA instructions
+ AVXNECONVERT // AVX-NE-CONVERT instructions
+ AVXSLOW // Indicates the CPU performs 2 128 bit operations instead of one
+ AVXVNNI // AVX (VEX encoded) VNNI neural network instructions
+ AVXVNNIINT8 // AVX-VNNI-INT8 instructions
+ BHI_CTRL // Branch History Injection and Intra-mode Branch Target Injection / CVE-2022-0001, CVE-2022-0002 / INTEL-SA-00598
+ BMI1 // Bit Manipulation Instruction Set 1
+ BMI2 // Bit Manipulation Instruction Set 2
+ CETIBT // Intel CET Indirect Branch Tracking
+ CETSS // Intel CET Shadow Stack
+ CLDEMOTE // Cache Line Demote
+ CLMUL // Carry-less Multiplication
+ CLZERO // CLZERO instruction supported
+ CMOV // i686 CMOV
+ CMPCCXADD // CMPCCXADD instructions
+ CMPSB_SCADBS_SHORT // Fast short CMPSB and SCASB
+ CMPXCHG8 // CMPXCHG8 instruction
+ CPBOOST // Core Performance Boost
+ CPPC // AMD: Collaborative Processor Performance Control
+ CX16 // CMPXCHG16B Instruction
+ EFER_LMSLE_UNS // AMD: =Core::X86::Msr::EFER[LMSLE] is not supported, and MBZ
+ ENQCMD // Enqueue Command
+ ERMS // Enhanced REP MOVSB/STOSB
+ F16C // Half-precision floating-point conversion
+ FLUSH_L1D // Flush L1D cache
+ FMA3 // Intel FMA 3. Does not imply AVX.
+ FMA4 // Bulldozer FMA4 functions
+ FP128 // AMD: When set, the internal FP/SIMD execution datapath is no more than 128-bits wide
+ FP256 // AMD: When set, the internal FP/SIMD execution datapath is no more than 256-bits wide
+ FSRM // Fast Short Rep Mov
+ FXSR // FXSAVE, FXRESTOR instructions, CR4 bit 9
+ FXSROPT // FXSAVE/FXRSTOR optimizations
+ GFNI // Galois Field New Instructions. May require other features (AVX, AVX512VL,AVX512F) based on usage.
+ HLE // Hardware Lock Elision
+ HRESET // If set CPU supports history reset and the IA32_HRESET_ENABLE MSR
+ HTT // Hyperthreading (enabled)
+ HWA // Hardware assert supported. Indicates support for MSRC001_10
+ HYBRID_CPU // This part has CPUs of more than one type.
+ HYPERVISOR // This bit has been reserved by Intel & AMD for use by hypervisors
+ IA32_ARCH_CAP // IA32_ARCH_CAPABILITIES MSR (Intel)
+ IA32_CORE_CAP // IA32_CORE_CAPABILITIES MSR
+ IBPB // Indirect Branch Restricted Speculation (IBRS) and Indirect Branch Predictor Barrier (IBPB)
+ IBPB_BRTYPE // Indicates that MSR 49h (PRED_CMD) bit 0 (IBPB) flushes all branch type predictions from the CPU branch predictor
+ IBRS // AMD: Indirect Branch Restricted Speculation
+ IBRS_PREFERRED // AMD: IBRS is preferred over software solution
+ IBRS_PROVIDES_SMP // AMD: IBRS provides Same Mode Protection
+ IBS // Instruction Based Sampling (AMD)
+ IBSBRNTRGT // Instruction Based Sampling Feature (AMD)
+ IBSFETCHSAM // Instruction Based Sampling Feature (AMD)
+ IBSFFV // Instruction Based Sampling Feature (AMD)
+ IBSOPCNT // Instruction Based Sampling Feature (AMD)
+ IBSOPCNTEXT // Instruction Based Sampling Feature (AMD)
+ IBSOPSAM // Instruction Based Sampling Feature (AMD)
+ IBSRDWROPCNT // Instruction Based Sampling Feature (AMD)
+ IBSRIPINVALIDCHK // Instruction Based Sampling Feature (AMD)
+ IBS_FETCH_CTLX // AMD: IBS fetch control extended MSR supported
+ IBS_OPDATA4 // AMD: IBS op data 4 MSR supported
+ IBS_OPFUSE // AMD: Indicates support for IbsOpFuse
+ IBS_PREVENTHOST // Disallowing IBS use by the host supported
+ IBS_ZEN4 // AMD: Fetch and Op IBS support IBS extensions added with Zen4
+ IDPRED_CTRL // IPRED_DIS
+ INT_WBINVD // WBINVD/WBNOINVD are interruptible.
+ INVLPGB // NVLPGB and TLBSYNC instruction supported
+ KEYLOCKER // Key locker
+ KEYLOCKERW // Key locker wide
+ LAHF // LAHF/SAHF in long mode
+ LAM // If set, CPU supports Linear Address Masking
+ LBRVIRT // LBR virtualization
+ LZCNT // LZCNT instruction
+ MCAOVERFLOW // MCA overflow recovery support.
+ MCDT_NO // Processor do not exhibit MXCSR Configuration Dependent Timing behavior and do not need to mitigate it.
+ MCOMMIT // MCOMMIT instruction supported
+ MD_CLEAR // VERW clears CPU buffers
+ MMX // standard MMX
+ MMXEXT // SSE integer functions or AMD MMX ext
+ MOVBE // MOVBE instruction (big-endian)
+ MOVDIR64B // Move 64 Bytes as Direct Store
+ MOVDIRI // Move Doubleword as Direct Store
+ MOVSB_ZL // Fast Zero-Length MOVSB
+ MOVU // AMD: MOVU SSE instructions are more efficient and should be preferred to SSE MOVL/MOVH. MOVUPS is more efficient than MOVLPS/MOVHPS. MOVUPD is more efficient than MOVLPD/MOVHPD
+ MPX // Intel MPX (Memory Protection Extensions)
+ MSRIRC // Instruction Retired Counter MSR available
+ MSRLIST // Read/Write List of Model Specific Registers
+ MSR_PAGEFLUSH // Page Flush MSR available
+ NRIPS // Indicates support for NRIP save on VMEXIT
+ NX // NX (No-Execute) bit
+ OSXSAVE // XSAVE enabled by OS
+ PCONFIG // PCONFIG for Intel Multi-Key Total Memory Encryption
+ POPCNT // POPCNT instruction
+ PPIN // AMD: Protected Processor Inventory Number support. Indicates that Protected Processor Inventory Number (PPIN) capability can be enabled
+ PREFETCHI // PREFETCHIT0/1 instructions
+ PSFD // Predictive Store Forward Disable
+ RDPRU // RDPRU instruction supported
+ RDRAND // RDRAND instruction is available
+ RDSEED // RDSEED instruction is available
+ RDTSCP // RDTSCP Instruction
+ RRSBA_CTRL // Restricted RSB Alternate
+ RTM // Restricted Transactional Memory
+ RTM_ALWAYS_ABORT // Indicates that the loaded microcode is forcing RTM abort.
+ SBPB // Indicates support for the Selective Branch Predictor Barrier
+ SERIALIZE // Serialize Instruction Execution
+ SEV // AMD Secure Encrypted Virtualization supported
+ SEV_64BIT // AMD SEV guest execution only allowed from a 64-bit host
+ SEV_ALTERNATIVE // AMD SEV Alternate Injection supported
+ SEV_DEBUGSWAP // Full debug state swap supported for SEV-ES guests
+ SEV_ES // AMD SEV Encrypted State supported
+ SEV_RESTRICTED // AMD SEV Restricted Injection supported
+ SEV_SNP // AMD SEV Secure Nested Paging supported
+ SGX // Software Guard Extensions
+ SGXLC // Software Guard Extensions Launch Control
+ SHA // Intel SHA Extensions
+ SME // AMD Secure Memory Encryption supported
+ SME_COHERENT // AMD Hardware cache coherency across encryption domains enforced
+ SPEC_CTRL_SSBD // Speculative Store Bypass Disable
+ SRBDS_CTRL // SRBDS mitigation MSR available
+ SRSO_MSR_FIX // Indicates that software may use MSR BP_CFG[BpSpecReduce] to mitigate SRSO.
+ SRSO_NO // Indicates the CPU is not subject to the SRSO vulnerability
+ SRSO_USER_KERNEL_NO // Indicates the CPU is not subject to the SRSO vulnerability across user/kernel boundaries
+ SSE // SSE functions
+ SSE2 // P4 SSE functions
+ SSE3 // Prescott SSE3 functions
+ SSE4 // Penryn SSE4.1 functions
+ SSE42 // Nehalem SSE4.2 functions
+ SSE4A // AMD Barcelona microarchitecture SSE4a instructions
+ SSSE3 // Conroe SSSE3 functions
+ STIBP // Single Thread Indirect Branch Predictors
+ STIBP_ALWAYSON // AMD: Single Thread Indirect Branch Prediction Mode has Enhanced Performance and may be left Always On
+ STOSB_SHORT // Fast short STOSB
+ SUCCOR // Software uncorrectable error containment and recovery capability.
+ SVM // AMD Secure Virtual Machine
+ SVMDA // Indicates support for the SVM decode assists.
+ SVMFBASID // SVM, Indicates that TLB flush events, including CR3 writes and CR4.PGE toggles, flush only the current ASID's TLB entries. Also indicates support for the extended VMCBTLB_Control
+ SVML // AMD SVM lock. Indicates support for SVM-Lock.
+ SVMNP // AMD SVM nested paging
+ SVMPF // SVM pause intercept filter. Indicates support for the pause intercept filter
+ SVMPFT // SVM PAUSE filter threshold. Indicates support for the PAUSE filter cycle count threshold
+ SYSCALL // System-Call Extension (SCE): SYSCALL and SYSRET instructions.
+ SYSEE // SYSENTER and SYSEXIT instructions
+ TBM // AMD Trailing Bit Manipulation
+ TDX_GUEST // Intel Trust Domain Extensions Guest
+ TLB_FLUSH_NESTED // AMD: Flushing includes all the nested translations for guest translations
+ TME // Intel Total Memory Encryption. The following MSRs are supported: IA32_TME_CAPABILITY, IA32_TME_ACTIVATE, IA32_TME_EXCLUDE_MASK, and IA32_TME_EXCLUDE_BASE.
+ TOPEXT // TopologyExtensions: topology extensions support. Indicates support for CPUID Fn8000_001D_EAX_x[N:0]-CPUID Fn8000_001E_EDX.
+ TSCRATEMSR // MSR based TSC rate control. Indicates support for MSR TSC ratio MSRC000_0104
+ TSXLDTRK // Intel TSX Suspend Load Address Tracking
+ VAES // Vector AES. AVX(512) versions requires additional checks.
+ VMCBCLEAN // VMCB clean bits. Indicates support for VMCB clean bits.
+ VMPL // AMD VM Permission Levels supported
+ VMSA_REGPROT // AMD VMSA Register Protection supported
+ VMX // Virtual Machine Extensions
+ VPCLMULQDQ // Carry-Less Multiplication Quadword. Requires AVX for 3 register versions.
+ VTE // AMD Virtual Transparent Encryption supported
+ WAITPKG // TPAUSE, UMONITOR, UMWAIT
+ WBNOINVD // Write Back and Do Not Invalidate Cache
+ WRMSRNS // Non-Serializing Write to Model Specific Register
+ X87 // FPU
+ XGETBV1 // Supports XGETBV with ECX = 1
+ XOP // Bulldozer XOP functions
+ XSAVE // XSAVE, XRESTOR, XSETBV, XGETBV
+ XSAVEC // Supports XSAVEC and the compacted form of XRSTOR.
+ XSAVEOPT // XSAVEOPT available
+ XSAVES // Supports XSAVES/XRSTORS and IA32_XSS
// ARM features:
AESARM // AES instructions
@@ -302,9 +314,11 @@ type CPUInfo struct {
L2 int // L2 Cache (per core or shared). Will be -1 if undetected
L3 int // L3 Cache (per core, per ccx or shared). Will be -1 if undetected
}
- SGX SGXSupport
- maxFunc uint32
- maxExFunc uint32
+ SGX SGXSupport
+ AMDMemEncryption AMDMemEncryptionSupport
+ AVX10Level uint8
+ maxFunc uint32
+ maxExFunc uint32
}
var cpuid func(op uint32) (eax, ebx, ecx, edx uint32)
@@ -1071,6 +1085,32 @@ func hasSGX(available, lc bool) (rval SGXSupport) {
return
}
+type AMDMemEncryptionSupport struct {
+ Available bool
+ CBitPossition uint32
+ NumVMPL uint32
+ PhysAddrReduction uint32
+ NumEntryptedGuests uint32
+ MinSevNoEsAsid uint32
+}
+
+func hasAMDMemEncryption(available bool) (rval AMDMemEncryptionSupport) {
+ rval.Available = available
+ if !available {
+ return
+ }
+
+ _, b, c, d := cpuidex(0x8000001f, 0)
+
+ rval.CBitPossition = b & 0x3f
+ rval.PhysAddrReduction = (b >> 6) & 0x3F
+ rval.NumVMPL = (b >> 12) & 0xf
+ rval.NumEntryptedGuests = c
+ rval.MinSevNoEsAsid = d
+
+ return
+}
+
func support() flagSet {
var fs flagSet
mfi := maxFunctionID()
@@ -1165,6 +1205,7 @@ func support() flagSet {
fs.setIf(ecx&(1<<10) != 0, VPCLMULQDQ)
fs.setIf(ecx&(1<<13) != 0, TME)
fs.setIf(ecx&(1<<25) != 0, CLDEMOTE)
+ fs.setIf(ecx&(1<<23) != 0, KEYLOCKER)
fs.setIf(ecx&(1<<27) != 0, MOVDIRI)
fs.setIf(ecx&(1<<28) != 0, MOVDIR64B)
fs.setIf(ecx&(1<<29) != 0, ENQCMD)
@@ -1202,6 +1243,8 @@ func support() flagSet {
fs.setIf(edx1&(1<<4) != 0, AVXVNNIINT8)
fs.setIf(edx1&(1<<5) != 0, AVXNECONVERT)
fs.setIf(edx1&(1<<14) != 0, PREFETCHI)
+ fs.setIf(edx1&(1<<19) != 0, AVX10)
+ fs.setIf(edx1&(1<<21) != 0, APX_F)
// Only detect AVX-512 features if XGETBV is supported
if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) {
@@ -1252,6 +1295,19 @@ func support() flagSet {
fs.setIf(edx&(1<<4) != 0, BHI_CTRL)
fs.setIf(edx&(1<<5) != 0, MCDT_NO)
+ // Add keylocker features.
+ if fs.inSet(KEYLOCKER) && mfi >= 0x19 {
+ _, ebx, _, _ := cpuidex(0x19, 0)
+ fs.setIf(ebx&5 == 5, KEYLOCKERW) // Bit 0 and 2 (1+4)
+ }
+
+ // Add AVX10 features.
+ if fs.inSet(AVX10) && mfi >= 0x24 {
+ _, ebx, _, _ := cpuidex(0x24, 0)
+ fs.setIf(ebx&(1<<16) != 0, AVX10_128)
+ fs.setIf(ebx&(1<<17) != 0, AVX10_256)
+ fs.setIf(ebx&(1<<18) != 0, AVX10_512)
+ }
}
// Processor Extended State Enumeration Sub-leaf (EAX = 0DH, ECX = 1)
@@ -1394,6 +1450,29 @@ func support() flagSet {
fs.setIf((a>>24)&1 == 1, VMSA_REGPROT)
}
+ if maxExtendedFunction() >= 0x80000021 && vend == AMD {
+ a, _, _, _ := cpuid(0x80000021)
+ fs.setIf((a>>31)&1 == 1, SRSO_MSR_FIX)
+ fs.setIf((a>>30)&1 == 1, SRSO_USER_KERNEL_NO)
+ fs.setIf((a>>29)&1 == 1, SRSO_NO)
+ fs.setIf((a>>28)&1 == 1, IBPB_BRTYPE)
+ fs.setIf((a>>27)&1 == 1, SBPB)
+ }
+
+ if mfi >= 0x20 {
+ // Microsoft has decided to purposefully hide the information
+ // of the guest TEE when VMs are being created using Hyper-V.
+ //
+ // This leads us to check for the Hyper-V cpuid features
+ // (0x4000000C), and then for the `ebx` value set.
+ //
+ // For Intel TDX, `ebx` is set to `0xbe3`, with 3 being the part
+ // we're mostly interested in, according to:
+ // https://github.com/torvalds/linux/blob/d2f51b3516dade79269ff45eae2a7668ae711b25/arch/x86/include/asm/hyperv-tlfs.h#L169-L174
+ _, ebx, _, _ := cpuid(0x4000000C)
+ fs.setIf(ebx == 0xbe3, TDX_GUEST)
+ }
+
if mfi >= 0x21 {
// Intel Trusted Domain Extensions Guests have their own cpuid leaf (0x21).
_, ebx, ecx, edx := cpuid(0x21)
@@ -1404,6 +1483,14 @@ func support() flagSet {
return fs
}
+func (c *CPUInfo) supportAVX10() uint8 {
+ if c.maxFunc >= 0x24 && c.featureSet.inSet(AVX10) {
+ _, ebx, _, _ := cpuidex(0x24, 0)
+ return uint8(ebx)
+ }
+ return 0
+}
+
func valAsString(values ...uint32) []byte {
r := make([]byte, 4*len(values))
for i, v := range values {
diff --git a/vendor/github.com/klauspost/cpuid/v2/detect_x86.go b/vendor/github.com/klauspost/cpuid/v2/detect_x86.go
index c946824ec..799b400c2 100644
--- a/vendor/github.com/klauspost/cpuid/v2/detect_x86.go
+++ b/vendor/github.com/klauspost/cpuid/v2/detect_x86.go
@@ -27,10 +27,12 @@ func addInfo(c *CPUInfo, safe bool) {
c.Family, c.Model, c.Stepping = familyModel()
c.featureSet = support()
c.SGX = hasSGX(c.featureSet.inSet(SGX), c.featureSet.inSet(SGXLC))
+ c.AMDMemEncryption = hasAMDMemEncryption(c.featureSet.inSet(SME) || c.featureSet.inSet(SEV))
c.ThreadsPerCore = threadsPerCore()
c.LogicalCores = logicalCores()
c.PhysicalCores = physicalCores()
c.VendorID, c.VendorString = vendorID()
+ c.AVX10Level = c.supportAVX10()
c.cacheSize()
c.frequencies()
}
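
A rough, hypothetical usage sketch (not part of the vendored files) of the feature flags and `CPUInfo` fields this cpuid bump introduces; it relies only on the package's existing exported `CPU` variable and `Supports` helper, and the printed labels are illustrative:

```go
package main

import (
	"fmt"

	"github.com/klauspost/cpuid/v2"
)

func main() {
	// Feature IDs added in v2.2.7.
	fmt.Println("AVX10:", cpuid.CPU.Supports(cpuid.AVX10))
	fmt.Println("APX_F:", cpuid.CPU.Supports(cpuid.APX_F))
	fmt.Println("Key Locker:", cpuid.CPU.Supports(cpuid.KEYLOCKER))

	// New fields populated during detection (wired up in detect_x86.go above).
	fmt.Println("AVX10 level:", cpuid.CPU.AVX10Level)
	fmt.Println("AMD memory encryption:", cpuid.CPU.AMDMemEncryption.Available)
}
```
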
diff --git a/vendor/github.com/klauspost/cpuid/v2/featureid_string.go b/vendor/github.com/klauspost/cpuid/v2/featureid_string.go
index 024c706af..57a085a53 100644
--- a/vendor/github.com/klauspost/cpuid/v2/featureid_string.go
+++ b/vendor/github.com/klauspost/cpuid/v2/featureid_string.go
@@ -16,210 +16,222 @@ func _() {
_ = x[AMXFP16-6]
_ = x[AMXINT8-7]
_ = x[AMXTILE-8]
- _ = x[AVX-9]
- _ = x[AVX2-10]
- _ = x[AVX512BF16-11]
- _ = x[AVX512BITALG-12]
- _ = x[AVX512BW-13]
- _ = x[AVX512CD-14]
- _ = x[AVX512DQ-15]
- _ = x[AVX512ER-16]
- _ = x[AVX512F-17]
- _ = x[AVX512FP16-18]
- _ = x[AVX512IFMA-19]
- _ = x[AVX512PF-20]
- _ = x[AVX512VBMI-21]
- _ = x[AVX512VBMI2-22]
- _ = x[AVX512VL-23]
- _ = x[AVX512VNNI-24]
- _ = x[AVX512VP2INTERSECT-25]
- _ = x[AVX512VPOPCNTDQ-26]
- _ = x[AVXIFMA-27]
- _ = x[AVXNECONVERT-28]
- _ = x[AVXSLOW-29]
- _ = x[AVXVNNI-30]
- _ = x[AVXVNNIINT8-31]
- _ = x[BHI_CTRL-32]
- _ = x[BMI1-33]
- _ = x[BMI2-34]
- _ = x[CETIBT-35]
- _ = x[CETSS-36]
- _ = x[CLDEMOTE-37]
- _ = x[CLMUL-38]
- _ = x[CLZERO-39]
- _ = x[CMOV-40]
- _ = x[CMPCCXADD-41]
- _ = x[CMPSB_SCADBS_SHORT-42]
- _ = x[CMPXCHG8-43]
- _ = x[CPBOOST-44]
- _ = x[CPPC-45]
- _ = x[CX16-46]
- _ = x[EFER_LMSLE_UNS-47]
- _ = x[ENQCMD-48]
- _ = x[ERMS-49]
- _ = x[F16C-50]
- _ = x[FLUSH_L1D-51]
- _ = x[FMA3-52]
- _ = x[FMA4-53]
- _ = x[FP128-54]
- _ = x[FP256-55]
- _ = x[FSRM-56]
- _ = x[FXSR-57]
- _ = x[FXSROPT-58]
- _ = x[GFNI-59]
- _ = x[HLE-60]
- _ = x[HRESET-61]
- _ = x[HTT-62]
- _ = x[HWA-63]
- _ = x[HYBRID_CPU-64]
- _ = x[HYPERVISOR-65]
- _ = x[IA32_ARCH_CAP-66]
- _ = x[IA32_CORE_CAP-67]
- _ = x[IBPB-68]
- _ = x[IBRS-69]
- _ = x[IBRS_PREFERRED-70]
- _ = x[IBRS_PROVIDES_SMP-71]
- _ = x[IBS-72]
- _ = x[IBSBRNTRGT-73]
- _ = x[IBSFETCHSAM-74]
- _ = x[IBSFFV-75]
- _ = x[IBSOPCNT-76]
- _ = x[IBSOPCNTEXT-77]
- _ = x[IBSOPSAM-78]
- _ = x[IBSRDWROPCNT-79]
- _ = x[IBSRIPINVALIDCHK-80]
- _ = x[IBS_FETCH_CTLX-81]
- _ = x[IBS_OPDATA4-82]
- _ = x[IBS_OPFUSE-83]
- _ = x[IBS_PREVENTHOST-84]
- _ = x[IBS_ZEN4-85]
- _ = x[IDPRED_CTRL-86]
- _ = x[INT_WBINVD-87]
- _ = x[INVLPGB-88]
- _ = x[LAHF-89]
- _ = x[LAM-90]
- _ = x[LBRVIRT-91]
- _ = x[LZCNT-92]
- _ = x[MCAOVERFLOW-93]
- _ = x[MCDT_NO-94]
- _ = x[MCOMMIT-95]
- _ = x[MD_CLEAR-96]
- _ = x[MMX-97]
- _ = x[MMXEXT-98]
- _ = x[MOVBE-99]
- _ = x[MOVDIR64B-100]
- _ = x[MOVDIRI-101]
- _ = x[MOVSB_ZL-102]
- _ = x[MOVU-103]
- _ = x[MPX-104]
- _ = x[MSRIRC-105]
- _ = x[MSRLIST-106]
- _ = x[MSR_PAGEFLUSH-107]
- _ = x[NRIPS-108]
- _ = x[NX-109]
- _ = x[OSXSAVE-110]
- _ = x[PCONFIG-111]
- _ = x[POPCNT-112]
- _ = x[PPIN-113]
- _ = x[PREFETCHI-114]
- _ = x[PSFD-115]
- _ = x[RDPRU-116]
- _ = x[RDRAND-117]
- _ = x[RDSEED-118]
- _ = x[RDTSCP-119]
- _ = x[RRSBA_CTRL-120]
- _ = x[RTM-121]
- _ = x[RTM_ALWAYS_ABORT-122]
- _ = x[SERIALIZE-123]
- _ = x[SEV-124]
- _ = x[SEV_64BIT-125]
- _ = x[SEV_ALTERNATIVE-126]
- _ = x[SEV_DEBUGSWAP-127]
- _ = x[SEV_ES-128]
- _ = x[SEV_RESTRICTED-129]
- _ = x[SEV_SNP-130]
- _ = x[SGX-131]
- _ = x[SGXLC-132]
- _ = x[SHA-133]
- _ = x[SME-134]
- _ = x[SME_COHERENT-135]
- _ = x[SPEC_CTRL_SSBD-136]
- _ = x[SRBDS_CTRL-137]
- _ = x[SSE-138]
- _ = x[SSE2-139]
- _ = x[SSE3-140]
- _ = x[SSE4-141]
- _ = x[SSE42-142]
- _ = x[SSE4A-143]
- _ = x[SSSE3-144]
- _ = x[STIBP-145]
- _ = x[STIBP_ALWAYSON-146]
- _ = x[STOSB_SHORT-147]
- _ = x[SUCCOR-148]
- _ = x[SVM-149]
- _ = x[SVMDA-150]
- _ = x[SVMFBASID-151]
- _ = x[SVML-152]
- _ = x[SVMNP-153]
- _ = x[SVMPF-154]
- _ = x[SVMPFT-155]
- _ = x[SYSCALL-156]
- _ = x[SYSEE-157]
- _ = x[TBM-158]
- _ = x[TDX_GUEST-159]
- _ = x[TLB_FLUSH_NESTED-160]
- _ = x[TME-161]
- _ = x[TOPEXT-162]
- _ = x[TSCRATEMSR-163]
- _ = x[TSXLDTRK-164]
- _ = x[VAES-165]
- _ = x[VMCBCLEAN-166]
- _ = x[VMPL-167]
- _ = x[VMSA_REGPROT-168]
- _ = x[VMX-169]
- _ = x[VPCLMULQDQ-170]
- _ = x[VTE-171]
- _ = x[WAITPKG-172]
- _ = x[WBNOINVD-173]
- _ = x[WRMSRNS-174]
- _ = x[X87-175]
- _ = x[XGETBV1-176]
- _ = x[XOP-177]
- _ = x[XSAVE-178]
- _ = x[XSAVEC-179]
- _ = x[XSAVEOPT-180]
- _ = x[XSAVES-181]
- _ = x[AESARM-182]
- _ = x[ARMCPUID-183]
- _ = x[ASIMD-184]
- _ = x[ASIMDDP-185]
- _ = x[ASIMDHP-186]
- _ = x[ASIMDRDM-187]
- _ = x[ATOMICS-188]
- _ = x[CRC32-189]
- _ = x[DCPOP-190]
- _ = x[EVTSTRM-191]
- _ = x[FCMA-192]
- _ = x[FP-193]
- _ = x[FPHP-194]
- _ = x[GPA-195]
- _ = x[JSCVT-196]
- _ = x[LRCPC-197]
- _ = x[PMULL-198]
- _ = x[SHA1-199]
- _ = x[SHA2-200]
- _ = x[SHA3-201]
- _ = x[SHA512-202]
- _ = x[SM3-203]
- _ = x[SM4-204]
- _ = x[SVE-205]
- _ = x[lastID-206]
+ _ = x[APX_F-9]
+ _ = x[AVX-10]
+ _ = x[AVX10-11]
+ _ = x[AVX10_128-12]
+ _ = x[AVX10_256-13]
+ _ = x[AVX10_512-14]
+ _ = x[AVX2-15]
+ _ = x[AVX512BF16-16]
+ _ = x[AVX512BITALG-17]
+ _ = x[AVX512BW-18]
+ _ = x[AVX512CD-19]
+ _ = x[AVX512DQ-20]
+ _ = x[AVX512ER-21]
+ _ = x[AVX512F-22]
+ _ = x[AVX512FP16-23]
+ _ = x[AVX512IFMA-24]
+ _ = x[AVX512PF-25]
+ _ = x[AVX512VBMI-26]
+ _ = x[AVX512VBMI2-27]
+ _ = x[AVX512VL-28]
+ _ = x[AVX512VNNI-29]
+ _ = x[AVX512VP2INTERSECT-30]
+ _ = x[AVX512VPOPCNTDQ-31]
+ _ = x[AVXIFMA-32]
+ _ = x[AVXNECONVERT-33]
+ _ = x[AVXSLOW-34]
+ _ = x[AVXVNNI-35]
+ _ = x[AVXVNNIINT8-36]
+ _ = x[BHI_CTRL-37]
+ _ = x[BMI1-38]
+ _ = x[BMI2-39]
+ _ = x[CETIBT-40]
+ _ = x[CETSS-41]
+ _ = x[CLDEMOTE-42]
+ _ = x[CLMUL-43]
+ _ = x[CLZERO-44]
+ _ = x[CMOV-45]
+ _ = x[CMPCCXADD-46]
+ _ = x[CMPSB_SCADBS_SHORT-47]
+ _ = x[CMPXCHG8-48]
+ _ = x[CPBOOST-49]
+ _ = x[CPPC-50]
+ _ = x[CX16-51]
+ _ = x[EFER_LMSLE_UNS-52]
+ _ = x[ENQCMD-53]
+ _ = x[ERMS-54]
+ _ = x[F16C-55]
+ _ = x[FLUSH_L1D-56]
+ _ = x[FMA3-57]
+ _ = x[FMA4-58]
+ _ = x[FP128-59]
+ _ = x[FP256-60]
+ _ = x[FSRM-61]
+ _ = x[FXSR-62]
+ _ = x[FXSROPT-63]
+ _ = x[GFNI-64]
+ _ = x[HLE-65]
+ _ = x[HRESET-66]
+ _ = x[HTT-67]
+ _ = x[HWA-68]
+ _ = x[HYBRID_CPU-69]
+ _ = x[HYPERVISOR-70]
+ _ = x[IA32_ARCH_CAP-71]
+ _ = x[IA32_CORE_CAP-72]
+ _ = x[IBPB-73]
+ _ = x[IBPB_BRTYPE-74]
+ _ = x[IBRS-75]
+ _ = x[IBRS_PREFERRED-76]
+ _ = x[IBRS_PROVIDES_SMP-77]
+ _ = x[IBS-78]
+ _ = x[IBSBRNTRGT-79]
+ _ = x[IBSFETCHSAM-80]
+ _ = x[IBSFFV-81]
+ _ = x[IBSOPCNT-82]
+ _ = x[IBSOPCNTEXT-83]
+ _ = x[IBSOPSAM-84]
+ _ = x[IBSRDWROPCNT-85]
+ _ = x[IBSRIPINVALIDCHK-86]
+ _ = x[IBS_FETCH_CTLX-87]
+ _ = x[IBS_OPDATA4-88]
+ _ = x[IBS_OPFUSE-89]
+ _ = x[IBS_PREVENTHOST-90]
+ _ = x[IBS_ZEN4-91]
+ _ = x[IDPRED_CTRL-92]
+ _ = x[INT_WBINVD-93]
+ _ = x[INVLPGB-94]
+ _ = x[KEYLOCKER-95]
+ _ = x[KEYLOCKERW-96]
+ _ = x[LAHF-97]
+ _ = x[LAM-98]
+ _ = x[LBRVIRT-99]
+ _ = x[LZCNT-100]
+ _ = x[MCAOVERFLOW-101]
+ _ = x[MCDT_NO-102]
+ _ = x[MCOMMIT-103]
+ _ = x[MD_CLEAR-104]
+ _ = x[MMX-105]
+ _ = x[MMXEXT-106]
+ _ = x[MOVBE-107]
+ _ = x[MOVDIR64B-108]
+ _ = x[MOVDIRI-109]
+ _ = x[MOVSB_ZL-110]
+ _ = x[MOVU-111]
+ _ = x[MPX-112]
+ _ = x[MSRIRC-113]
+ _ = x[MSRLIST-114]
+ _ = x[MSR_PAGEFLUSH-115]
+ _ = x[NRIPS-116]
+ _ = x[NX-117]
+ _ = x[OSXSAVE-118]
+ _ = x[PCONFIG-119]
+ _ = x[POPCNT-120]
+ _ = x[PPIN-121]
+ _ = x[PREFETCHI-122]
+ _ = x[PSFD-123]
+ _ = x[RDPRU-124]
+ _ = x[RDRAND-125]
+ _ = x[RDSEED-126]
+ _ = x[RDTSCP-127]
+ _ = x[RRSBA_CTRL-128]
+ _ = x[RTM-129]
+ _ = x[RTM_ALWAYS_ABORT-130]
+ _ = x[SBPB-131]
+ _ = x[SERIALIZE-132]
+ _ = x[SEV-133]
+ _ = x[SEV_64BIT-134]
+ _ = x[SEV_ALTERNATIVE-135]
+ _ = x[SEV_DEBUGSWAP-136]
+ _ = x[SEV_ES-137]
+ _ = x[SEV_RESTRICTED-138]
+ _ = x[SEV_SNP-139]
+ _ = x[SGX-140]
+ _ = x[SGXLC-141]
+ _ = x[SHA-142]
+ _ = x[SME-143]
+ _ = x[SME_COHERENT-144]
+ _ = x[SPEC_CTRL_SSBD-145]
+ _ = x[SRBDS_CTRL-146]
+ _ = x[SRSO_MSR_FIX-147]
+ _ = x[SRSO_NO-148]
+ _ = x[SRSO_USER_KERNEL_NO-149]
+ _ = x[SSE-150]
+ _ = x[SSE2-151]
+ _ = x[SSE3-152]
+ _ = x[SSE4-153]
+ _ = x[SSE42-154]
+ _ = x[SSE4A-155]
+ _ = x[SSSE3-156]
+ _ = x[STIBP-157]
+ _ = x[STIBP_ALWAYSON-158]
+ _ = x[STOSB_SHORT-159]
+ _ = x[SUCCOR-160]
+ _ = x[SVM-161]
+ _ = x[SVMDA-162]
+ _ = x[SVMFBASID-163]
+ _ = x[SVML-164]
+ _ = x[SVMNP-165]
+ _ = x[SVMPF-166]
+ _ = x[SVMPFT-167]
+ _ = x[SYSCALL-168]
+ _ = x[SYSEE-169]
+ _ = x[TBM-170]
+ _ = x[TDX_GUEST-171]
+ _ = x[TLB_FLUSH_NESTED-172]
+ _ = x[TME-173]
+ _ = x[TOPEXT-174]
+ _ = x[TSCRATEMSR-175]
+ _ = x[TSXLDTRK-176]
+ _ = x[VAES-177]
+ _ = x[VMCBCLEAN-178]
+ _ = x[VMPL-179]
+ _ = x[VMSA_REGPROT-180]
+ _ = x[VMX-181]
+ _ = x[VPCLMULQDQ-182]
+ _ = x[VTE-183]
+ _ = x[WAITPKG-184]
+ _ = x[WBNOINVD-185]
+ _ = x[WRMSRNS-186]
+ _ = x[X87-187]
+ _ = x[XGETBV1-188]
+ _ = x[XOP-189]
+ _ = x[XSAVE-190]
+ _ = x[XSAVEC-191]
+ _ = x[XSAVEOPT-192]
+ _ = x[XSAVES-193]
+ _ = x[AESARM-194]
+ _ = x[ARMCPUID-195]
+ _ = x[ASIMD-196]
+ _ = x[ASIMDDP-197]
+ _ = x[ASIMDHP-198]
+ _ = x[ASIMDRDM-199]
+ _ = x[ATOMICS-200]
+ _ = x[CRC32-201]
+ _ = x[DCPOP-202]
+ _ = x[EVTSTRM-203]
+ _ = x[FCMA-204]
+ _ = x[FP-205]
+ _ = x[FPHP-206]
+ _ = x[GPA-207]
+ _ = x[JSCVT-208]
+ _ = x[LRCPC-209]
+ _ = x[PMULL-210]
+ _ = x[SHA1-211]
+ _ = x[SHA2-212]
+ _ = x[SHA3-213]
+ _ = x[SHA512-214]
+ _ = x[SM3-215]
+ _ = x[SM4-216]
+ _ = x[SVE-217]
+ _ = x[lastID-218]
_ = x[firstID-0]
}
-const _FeatureID_name = "firstIDADXAESNIAMD3DNOWAMD3DNOWEXTAMXBF16AMXFP16AMXINT8AMXTILEAVXAVX2AVX512BF16AVX512BITALGAVX512BWAVX512CDAVX512DQAVX512ERAVX512FAVX512FP16AVX512IFMAAVX512PFAVX512VBMIAVX512VBMI2AVX512VLAVX512VNNIAVX512VP2INTERSECTAVX512VPOPCNTDQAVXIFMAAVXNECONVERTAVXSLOWAVXVNNIAVXVNNIINT8BHI_CTRLBMI1BMI2CETIBTCETSSCLDEMOTECLMULCLZEROCMOVCMPCCXADDCMPSB_SCADBS_SHORTCMPXCHG8CPBOOSTCPPCCX16EFER_LMSLE_UNSENQCMDERMSF16CFLUSH_L1DFMA3FMA4FP128FP256FSRMFXSRFXSROPTGFNIHLEHRESETHTTHWAHYBRID_CPUHYPERVISORIA32_ARCH_CAPIA32_CORE_CAPIBPBIBRSIBRS_PREFERREDIBRS_PROVIDES_SMPIBSIBSBRNTRGTIBSFETCHSAMIBSFFVIBSOPCNTIBSOPCNTEXTIBSOPSAMIBSRDWROPCNTIBSRIPINVALIDCHKIBS_FETCH_CTLXIBS_OPDATA4IBS_OPFUSEIBS_PREVENTHOSTIBS_ZEN4IDPRED_CTRLINT_WBINVDINVLPGBLAHFLAMLBRVIRTLZCNTMCAOVERFLOWMCDT_NOMCOMMITMD_CLEARMMXMMXEXTMOVBEMOVDIR64BMOVDIRIMOVSB_ZLMOVUMPXMSRIRCMSRLISTMSR_PAGEFLUSHNRIPSNXOSXSAVEPCONFIGPOPCNTPPINPREFETCHIPSFDRDPRURDRANDRDSEEDRDTSCPRRSBA_CTRLRTMRTM_ALWAYS_ABORTSERIALIZESEVSEV_64BITSEV_ALTERNATIVESEV_DEBUGSWAPSEV_ESSEV_RESTRICTEDSEV_SNPSGXSGXLCSHASMESME_COHERENTSPEC_CTRL_SSBDSRBDS_CTRLSSESSE2SSE3SSE4SSE42SSE4ASSSE3STIBPSTIBP_ALWAYSONSTOSB_SHORTSUCCORSVMSVMDASVMFBASIDSVMLSVMNPSVMPFSVMPFTSYSCALLSYSEETBMTDX_GUESTTLB_FLUSH_NESTEDTMETOPEXTTSCRATEMSRTSXLDTRKVAESVMCBCLEANVMPLVMSA_REGPROTVMXVPCLMULQDQVTEWAITPKGWBNOINVDWRMSRNSX87XGETBV1XOPXSAVEXSAVECXSAVEOPTXSAVESAESARMARMCPUIDASIMDASIMDDPASIMDHPASIMDRDMATOMICSCRC32DCPOPEVTSTRMFCMAFPFPHPGPAJSCVTLRCPCPMULLSHA1SHA2SHA3SHA512SM3SM4SVElastID"
+const _FeatureID_name = "firstIDADXAESNIAMD3DNOWAMD3DNOWEXTAMXBF16AMXFP16AMXINT8AMXTILEAPX_FAVXAVX10AVX10_128AVX10_256AVX10_512AVX2AVX512BF16AVX512BITALGAVX512BWAVX512CDAVX512DQAVX512ERAVX512FAVX512FP16AVX512IFMAAVX512PFAVX512VBMIAVX512VBMI2AVX512VLAVX512VNNIAVX512VP2INTERSECTAVX512VPOPCNTDQAVXIFMAAVXNECONVERTAVXSLOWAVXVNNIAVXVNNIINT8BHI_CTRLBMI1BMI2CETIBTCETSSCLDEMOTECLMULCLZEROCMOVCMPCCXADDCMPSB_SCADBS_SHORTCMPXCHG8CPBOOSTCPPCCX16EFER_LMSLE_UNSENQCMDERMSF16CFLUSH_L1DFMA3FMA4FP128FP256FSRMFXSRFXSROPTGFNIHLEHRESETHTTHWAHYBRID_CPUHYPERVISORIA32_ARCH_CAPIA32_CORE_CAPIBPBIBPB_BRTYPEIBRSIBRS_PREFERREDIBRS_PROVIDES_SMPIBSIBSBRNTRGTIBSFETCHSAMIBSFFVIBSOPCNTIBSOPCNTEXTIBSOPSAMIBSRDWROPCNTIBSRIPINVALIDCHKIBS_FETCH_CTLXIBS_OPDATA4IBS_OPFUSEIBS_PREVENTHOSTIBS_ZEN4IDPRED_CTRLINT_WBINVDINVLPGBKEYLOCKERKEYLOCKERWLAHFLAMLBRVIRTLZCNTMCAOVERFLOWMCDT_NOMCOMMITMD_CLEARMMXMMXEXTMOVBEMOVDIR64BMOVDIRIMOVSB_ZLMOVUMPXMSRIRCMSRLISTMSR_PAGEFLUSHNRIPSNXOSXSAVEPCONFIGPOPCNTPPINPREFETCHIPSFDRDPRURDRANDRDSEEDRDTSCPRRSBA_CTRLRTMRTM_ALWAYS_ABORTSBPBSERIALIZESEVSEV_64BITSEV_ALTERNATIVESEV_DEBUGSWAPSEV_ESSEV_RESTRICTEDSEV_SNPSGXSGXLCSHASMESME_COHERENTSPEC_CTRL_SSBDSRBDS_CTRLSRSO_MSR_FIXSRSO_NOSRSO_USER_KERNEL_NOSSESSE2SSE3SSE4SSE42SSE4ASSSE3STIBPSTIBP_ALWAYSONSTOSB_SHORTSUCCORSVMSVMDASVMFBASIDSVMLSVMNPSVMPFSVMPFTSYSCALLSYSEETBMTDX_GUESTTLB_FLUSH_NESTEDTMETOPEXTTSCRATEMSRTSXLDTRKVAESVMCBCLEANVMPLVMSA_REGPROTVMXVPCLMULQDQVTEWAITPKGWBNOINVDWRMSRNSX87XGETBV1XOPXSAVEXSAVECXSAVEOPTXSAVESAESARMARMCPUIDASIMDASIMDDPASIMDHPASIMDRDMATOMICSCRC32DCPOPEVTSTRMFCMAFPFPHPGPAJSCVTLRCPCPMULLSHA1SHA2SHA3SHA512SM3SM4SVElastID"
-var _FeatureID_index = [...]uint16{0, 7, 10, 15, 23, 34, 41, 48, 55, 62, 65, 69, 79, 91, 99, 107, 115, 123, 130, 140, 150, 158, 168, 179, 187, 197, 215, 230, 237, 249, 256, 263, 274, 282, 286, 290, 296, 301, 309, 314, 320, 324, 333, 351, 359, 366, 370, 374, 388, 394, 398, 402, 411, 415, 419, 424, 429, 433, 437, 444, 448, 451, 457, 460, 463, 473, 483, 496, 509, 513, 517, 531, 548, 551, 561, 572, 578, 586, 597, 605, 617, 633, 647, 658, 668, 683, 691, 702, 712, 719, 723, 726, 733, 738, 749, 756, 763, 771, 774, 780, 785, 794, 801, 809, 813, 816, 822, 829, 842, 847, 849, 856, 863, 869, 873, 882, 886, 891, 897, 903, 909, 919, 922, 938, 947, 950, 959, 974, 987, 993, 1007, 1014, 1017, 1022, 1025, 1028, 1040, 1054, 1064, 1067, 1071, 1075, 1079, 1084, 1089, 1094, 1099, 1113, 1124, 1130, 1133, 1138, 1147, 1151, 1156, 1161, 1167, 1174, 1179, 1182, 1191, 1207, 1210, 1216, 1226, 1234, 1238, 1247, 1251, 1263, 1266, 1276, 1279, 1286, 1294, 1301, 1304, 1311, 1314, 1319, 1325, 1333, 1339, 1345, 1353, 1358, 1365, 1372, 1380, 1387, 1392, 1397, 1404, 1408, 1410, 1414, 1417, 1422, 1427, 1432, 1436, 1440, 1444, 1450, 1453, 1456, 1459, 1465}
+var _FeatureID_index = [...]uint16{0, 7, 10, 15, 23, 34, 41, 48, 55, 62, 67, 70, 75, 84, 93, 102, 106, 116, 128, 136, 144, 152, 160, 167, 177, 187, 195, 205, 216, 224, 234, 252, 267, 274, 286, 293, 300, 311, 319, 323, 327, 333, 338, 346, 351, 357, 361, 370, 388, 396, 403, 407, 411, 425, 431, 435, 439, 448, 452, 456, 461, 466, 470, 474, 481, 485, 488, 494, 497, 500, 510, 520, 533, 546, 550, 561, 565, 579, 596, 599, 609, 620, 626, 634, 645, 653, 665, 681, 695, 706, 716, 731, 739, 750, 760, 767, 776, 786, 790, 793, 800, 805, 816, 823, 830, 838, 841, 847, 852, 861, 868, 876, 880, 883, 889, 896, 909, 914, 916, 923, 930, 936, 940, 949, 953, 958, 964, 970, 976, 986, 989, 1005, 1009, 1018, 1021, 1030, 1045, 1058, 1064, 1078, 1085, 1088, 1093, 1096, 1099, 1111, 1125, 1135, 1147, 1154, 1173, 1176, 1180, 1184, 1188, 1193, 1198, 1203, 1208, 1222, 1233, 1239, 1242, 1247, 1256, 1260, 1265, 1270, 1276, 1283, 1288, 1291, 1300, 1316, 1319, 1325, 1335, 1343, 1347, 1356, 1360, 1372, 1375, 1385, 1388, 1395, 1403, 1410, 1413, 1420, 1423, 1428, 1434, 1442, 1448, 1454, 1462, 1467, 1474, 1481, 1489, 1496, 1501, 1506, 1513, 1517, 1519, 1523, 1526, 1531, 1536, 1541, 1545, 1549, 1553, 1559, 1562, 1565, 1568, 1574}
func (i FeatureID) String() string {
if i < 0 || i >= FeatureID(len(_FeatureID_index)-1) {
diff --git a/vendor/github.com/klauspost/reedsolomon/.gitignore b/vendor/github.com/klauspost/reedsolomon/.gitignore
new file mode 100644
index 000000000..59610b561
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/.gitignore
@@ -0,0 +1,26 @@
+# Compiled Object files, Static and Dynamic libs (Shared Objects)
+*.o
+*.a
+*.so
+
+# Folders
+_obj
+_test
+
+# Architecture specific extensions/prefixes
+*.[568vq]
+[568vq].out
+
+*.cgo1.go
+*.cgo2.c
+_cgo_defun.c
+_cgo_gotypes.go
+_cgo_export.*
+
+_testmain.go
+
+*.exe
+*.test
+*.prof
+
+.idea
\ No newline at end of file
diff --git a/vendor/github.com/klauspost/reedsolomon/LICENSE b/vendor/github.com/klauspost/reedsolomon/LICENSE
new file mode 100644
index 000000000..a947e162b
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/LICENSE
@@ -0,0 +1,23 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Klaus Post
+Copyright (c) 2015 Backblaze
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/vendor/github.com/klauspost/reedsolomon/README.md b/vendor/github.com/klauspost/reedsolomon/README.md
new file mode 100644
index 000000000..bdcb9e787
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/README.md
@@ -0,0 +1,566 @@
+# Reed-Solomon
+[![Go Reference](https://pkg.go.dev/badge/github.com/klauspost/reedsolomon.svg)](https://pkg.go.dev/github.com/klauspost/reedsolomon) [![Go](https://github.com/klauspost/reedsolomon/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/reedsolomon/actions/workflows/go.yml)
+
+Reed-Solomon Erasure Coding in Go, with speeds exceeding 1GB/s/cpu core implemented in pure Go.
+
+This is a Go port of the [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon) library released by
+[Backblaze](http://backblaze.com), with some additional optimizations.
+
+For an introduction on erasure coding, see the post on the [Backblaze blog](https://www.backblaze.com/blog/reed-solomon/).
+
+For encoding high shard counts (>256) a Leopard implementation is used.
+For most platforms this performs close to the original Leopard implementation in terms of speed.
+
+Package home: https://github.com/klauspost/reedsolomon
+
+Godoc: https://pkg.go.dev/github.com/klauspost/reedsolomon
+
+# Installation
+To get the package use the standard:
+```bash
+go get -u github.com/klauspost/reedsolomon
+```
+
+Using Go modules is recommended.
+
+# Changes
+
+## 2022
+
+* [GFNI](https://github.com/klauspost/reedsolomon/pull/224) support for amd64, for up to 3x faster processing.
+* [Leopard GF8](https://github.com/klauspost/reedsolomon#leopard-gf8) mode added, for faster processing of medium shard counts.
+* [Leopard GF16](https://github.com/klauspost/reedsolomon#leopard-compatible-gf16) mode added, for up to 65536 shards.
+* [WithJerasureMatrix](https://pkg.go.dev/github.com/klauspost/reedsolomon?tab=doc#WithJerasureMatrix) allows constructing a [Jerasure](https://github.com/tsuraan/Jerasure) compatible matrix.
+
+## 2021
+
+* Use `GOAMD64=v4` to enable faster AVX2.
+* Add progressive shard encoding.
+* Wider AVX2 loops
+* Limit concurrency on AVX2, since we are likely memory bound.
+* Allow 0 parity shards.
+* Allow disabling inversion cache.
+* Faster AVX2 encoding.
+
+
+ See older changes
+
+## May 2020
+
+* ARM64 optimizations, up to 2.5x faster.
+* Added [WithFastOneParityMatrix](https://pkg.go.dev/github.com/klauspost/reedsolomon?tab=doc#WithFastOneParityMatrix) for faster operation with 1 parity shard.
+* Much better performance when using a limited number of goroutines.
+* AVX512 is now using multiple cores.
+* Stream processing overhaul, big speedups in most cases.
+* AVX512 optimizations
+
+## March 6, 2019
+
+The pure Go implementation is about 30% faster. Minor tweaks to assembler implementations.
+
+## February 8, 2019
+
+AVX512 accelerated version added for Intel Skylake CPUs. This can give up to a 4x speed improvement as compared to AVX2.
+See [here](https://github.com/klauspost/reedsolomon#performance-on-avx512) for more details.
+
+## December 18, 2018
+
+Assembly code for ppc64le has been contributed; this boosts performance by about 10x on this platform.
+
+## November 18, 2017
+
+Added [WithAutoGoroutines](https://godoc.org/github.com/klauspost/reedsolomon#WithAutoGoroutines) which will attempt
+to calculate the optimal number of goroutines to use based on your expected shard size and detected CPU.
+
+## October 1, 2017
+
+* [Cauchy Matrix](https://godoc.org/github.com/klauspost/reedsolomon#WithCauchyMatrix) is now an option.
+Thanks to [templexxx](https://github.com/templexxx) for the basis of this.
+
+* Default maximum number of [goroutines](https://godoc.org/github.com/klauspost/reedsolomon#WithMaxGoroutines)
+has been increased for better multi-core scaling.
+
+* After several requests, Reconstruct and ReconstructData now allow slices of zero length but sufficient capacity to
+be used instead of allocating new memory.
+
+## August 26, 2017
+
+* The [`Encoder()`](https://godoc.org/github.com/klauspost/reedsolomon#Encoder) now contains an `Update`
+function contributed by [chenzhongtao](https://github.com/chenzhongtao).
+
+* [Frank Wessels](https://github.com/fwessels) kindly contributed ARM 64 bit assembly,
+which gives a huge performance boost on this platform.
+
+## July 20, 2017
+
+`ReconstructData` added to [`Encoder`](https://godoc.org/github.com/klauspost/reedsolomon#Encoder) interface.
+This can cause compatibility issues if you implement your own Encoder. A simple workaround can be added:
+
+```Go
+func (e *YourEnc) ReconstructData(shards [][]byte) error {
+ return ReconstructData(shards)
+}
+```
+
+You can of course also do your own implementation.
+The [`StreamEncoder`](https://godoc.org/github.com/klauspost/reedsolomon#StreamEncoder)
+handles this without modifying the interface.
+This is a good lesson on why returning interfaces is not a good design.
+
+
+
+# Usage
+
+This section assumes you know the basics of Reed-Solomon encoding.
+A good start is this [Backblaze blog post](https://www.backblaze.com/blog/reed-solomon/).
+
+This package performs the calculation of the parity sets. The usage is therefore relatively simple.
+
+First of all, you need to choose your distribution of data and parity shards.
+A 'good' distribution is very subjective, and will depend a lot on your usage scenario.
+
+To create an encoder with 10 data shards (where your data goes) and 3 parity shards (calculated):
+```Go
+ enc, err := reedsolomon.New(10, 3)
+```
+This encoder will work for all parity sets with this distribution of data and parity shards.
+
+If you will primarily be using it with one shard size it is recommended to use
+[`WithAutoGoroutines(shardSize)`](https://pkg.go.dev/github.com/klauspost/reedsolomon?tab=doc#WithAutoGoroutines)
+as an additional parameter. This will attempt to calculate the optimal number of goroutines to use for the best speed.
+It is not required that all shards are this size.
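+
+For example, a minimal sketch of creating an encoder with this option (the 50000 byte shard size is only an illustrative assumption):
+
+```Go
+  // Tuned for shards of roughly 50000 bytes each.
+  enc, err := reedsolomon.New(10, 3, reedsolomon.WithAutoGoroutines(50000))
+  if err != nil {
+      panic(err)
+  }
+```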
+
+Then you send and receive data that is a simple slice of byte slices; `[][]byte`.
+In the example above, the top slice must have a length of 13.
+
+```Go
+ data := make([][]byte, 13)
+```
+You should then fill the first 10 slices with *equally sized* data,
+and create parity shards that will be populated with parity data. In this case we create the data in memory,
+but you could for instance also use [mmap](https://github.com/edsrzf/mmap-go) to map files.
+
+```Go
+ // Create all shards, size them at 50000 each
+ for i := range data {
+ data[i] = make([]byte, 50000)
+ }
+
+ // The above allocations can also be done by the encoder:
+ // data := enc.(reedsolomon.Extended).AllocAligned(50000)
+
+ // Fill some data into the data shards
+ for i, in := range data[:10] {
+ for j := range in {
+ in[j] = byte((i+j)&0xff)
+ }
+ }
+```
+
+To populate the parity shards, you simply call `Encode()` with your data.
+```Go
+ err = enc.Encode(data)
+```
+The only case where you should get an error is if the data shards aren't of equal size.
+The last 3 shards now contain parity data. You can verify this by calling `Verify()`:
+
+```Go
+ ok, err = enc.Verify(data)
+```
+
+The final (and important) part is to be able to reconstruct missing shards.
+For this to work, you need to know which parts of your data are missing.
+The encoder *does not know which parts are invalid*, so if data corruption is a likely scenario,
+you need to implement a hash check for each shard.
+
+If a byte has changed in your set, and you don't know which it is, there is no way to reconstruct the data set.
+
+To indicate missing data, you set the shard to nil before calling `Reconstruct()`:
+
+```Go
+ // Delete two data shards
+ data[3] = nil
+ data[7] = nil
+
+ // Reconstruct the missing shards
+ err := enc.Reconstruct(data)
+```
+The missing data and parity shards will be recreated. If more than 3 shards are missing, the reconstruction will fail.
+
+If you are only interested in the data shards (for reading purposes) you can call `ReconstructData()`:
+
+```Go
+ // Delete two data shards
+ data[3] = nil
+ data[7] = nil
+
+ // Reconstruct just the missing data shards
+ err := enc.ReconstructData(data)
+```
+
+If you don't need all data shards you can use `ReconstructSome()`:
+
+```Go
+ // Delete two data shards
+ data[3] = nil
+ data[7] = nil
+
+ // Reconstruct just shard 3
+ err := enc.ReconstructSome(data, []bool{false, false, false, true, false, false, false, false})
+```
+
+So to sum up reconstruction:
+* The number of data/parity shards must match the numbers used for encoding.
+* The order of shards must be the same as used when encoding.
+* You may only supply data you know is valid.
+* Invalid shards should be set to nil.
+
+For complete examples of an encoder and decoder see the
+[examples folder](https://github.com/klauspost/reedsolomon/tree/master/examples).
+
+# Splitting/Joining Data
+
+You might have a large slice of data.
+To help you split this, there are some helper functions that can split and join a single byte slice.
+
+```Go
+ bigfile, _ := ioutil.ReadFile("myfile.data")
+
+ // Split the file
+ split, err := enc.Split(bigfile)
+```
+This will split the file into the number of data shards set when creating the encoder and create empty parity shards.
+
+An important thing to note is that you have to *keep track of the exact input size*.
+If the size of the input isn't divisible by the number of data shards, extra zeros will be inserted in the last shard.
+
+To join a data set, use the `Join()` function, which will join the shards and write it to the `io.Writer` you supply:
+```Go
+ // Join a data set and write it to io.Discard.
+ err = enc.Join(io.Discard, data, len(bigfile))
+```
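+
+Putting it together, a minimal sketch of a full round trip, reusing `enc` and `bigfile` from above (requires `bytes`):
+
+```Go
+ // Split, encode, and join back to the original size.
+ shards, err := enc.Split(bigfile)
+ if err == nil {
+  err = enc.Encode(shards)
+ }
+ if err == nil {
+  var buf bytes.Buffer
+  // Passing len(bigfile) trims the zero padding Split may have added.
+  err = enc.Join(&buf, shards, len(bigfile))
+ }
+```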
+
+## Aligned Allocations
+
+On AMD64, aligned inputs can make a big speed difference.
+
+This is an example of the speed difference when inputs are unaligned/aligned:
+
+```
+BenchmarkEncode100x20x10000-32 7058 172648 ns/op 6950.57 MB/s
+BenchmarkEncode100x20x10000-32 8406 137911 ns/op 8701.24 MB/s
+```
+
+This is mostly the case when dealing with odd-sized shards.
+
+To facilitate this the package provides an `AllocAligned(shards, each int) [][]byte`.
+This will allocate a number of shards, each with the size `each`.
+Each shard will then be aligned to a 64 byte boundary.
+
+Each encoder also has an `AllocAligned(each int) [][]byte` as an extended interface which will return the same,
+but with the shard count configured in the encoder.
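+
+For example, a minimal sketch reusing the 10+3 encoder and the 50000 byte shard size from above:
+
+```Go
+ // Allocate 13 shards of 50000 bytes, each aligned to a 64 byte boundary.
+ shards := reedsolomon.AllocAligned(13, 50000)
+
+ // Or let the encoder supply its configured shard count:
+ shards = enc.(reedsolomon.Extended).AllocAligned(50000)
+```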
+
+It is not possible to re-align already allocated slices, for example when using `Split`.
+When it is not possible to write to aligned shards, you should not copy to them.
+
+# Progressive encoding
+
+It is possible to encode individual shards using EncodeIdx:
+
+```Go
+ // EncodeIdx will add parity for a single data shard.
+ // Parity shards should start out as 0. The caller must zero them.
+ // Data shards must be delivered exactly once. There is no check for this.
+ // The parity shards will always be updated and the data shards will remain the same.
+ EncodeIdx(dataShard []byte, idx int, parity [][]byte) error
+```
+
+This allows progressively encoding the parity by sending individual data shards.
+There is no requirement on shards being delivered in order,
+but when sent in order it allows encoding shards one at the time,
+effectively allowing the operation to be streaming.
+
+The result will be the same as encoding all shards at once.
+There is a minor speed penalty using this method, so send
+all shards at once if they are available.
+
+## Example
+
+```Go
+func test() {
+ // Create an encoder with 7 data and 3 parity slices.
+ enc, _ := reedsolomon.New(7, 3)
+
+ // This will be our output parity.
+ parity := make([][]byte, 3)
+ for i := range parity {
+ parity[i] = make([]byte, 10000)
+ }
+
+ for i := 0; i < 7; i++ {
+ // Send data shards one at the time.
+ _ = enc.EncodeIdx(make([]byte, 10000), i, parity)
+ }
+
+ // parity now contains parity, as if all data was sent in one call.
+}
+```
+
+# Streaming/Merging
+
+It might seem like a limitation that all data should be in memory,
+but an important property is that *as long as the number of data/parity shards are the same,
+you can merge/split data sets*, and they will remain valid as a separate set.
+
+```Go
+ // Split the data set of 50000 elements into two of 25000
+ splitA := make([][]byte, 13)
+ splitB := make([][]byte, 13)
+
+ // Merge into a 100000 element set
+ merged := make([][]byte, 13)
+
+ for i := range data {
+ splitA[i] = data[i][:25000]
+ splitB[i] = data[i][25000:]
+
+ // Concatenate it to itself
+ merged[i] = append(make([]byte, 0, len(data[i])*2), data[i]...)
+ merged[i] = append(merged[i], data[i]...)
+ }
+
+ // Each part should still verify as ok.
+ ok, err := enc.Verify(splitA)
+ if ok && err == nil {
+ log.Println("splitA ok")
+ }
+
+ ok, err = enc.Verify(splitB)
+ if ok && err == nil {
+ log.Println("splitB ok")
+ }
+
+ ok, err = enc.Verify(merged)
+ if ok && err == nil {
+ log.Println("merge ok")
+ }
+```
+
+This means that if you have a data set that may not fit into memory, you can split processing into smaller blocks.
+For the best throughput, don't use too small blocks.
+
+This also means that you can divide big input up into smaller blocks, and do reconstruction on parts of your data.
+This doesn't give the same flexibility of a higher number of data shards, but it will be much more performant.
+
+# Streaming API
+
+Support has been added for a streaming API, to help perform fully streaming operations,
+which enables you to do the same operations, but on streams.
+To use the stream API, use [`NewStream`](https://godoc.org/github.com/klauspost/reedsolomon#NewStream) function
+to create the encoding/decoding interfaces.
+
+You can use [`WithConcurrentStreams`](https://godoc.org/github.com/klauspost/reedsolomon#WithConcurrentStreams)
+to ready an interface that reads/writes concurrently from the streams.
+
+You can specify the size of each operation using
+[`WithStreamBlockSize`](https://godoc.org/github.com/klauspost/reedsolomon#WithStreamBlockSize).
+This will set the size of each read/write operation.
+
+Input is delivered as `[]io.Reader`, output as `[]io.Writer`, and functionality corresponds to the in-memory API.
+Each stream must supply the same amount of data, just as each slice must be the same size with the in-memory API.
+If an error occurs in relation to a stream,
+a [`StreamReadError`](https://godoc.org/github.com/klauspost/reedsolomon#StreamReadError)
+or [`StreamWriteError`](https://godoc.org/github.com/klauspost/reedsolomon#StreamWriteError)
+will help you determine which stream was the offender.
+
+There is no buffering or timeouts/retry specified. If you want to add that, you need to add it to the Reader/Writer.
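+
+A minimal sketch of streaming encoding; the in-memory buffers and the 50000 byte stream size are only assumptions for illustration (requires `bytes` and `io`):
+
+```Go
+ // Create a streaming encoder with 10 data and 3 parity shards.
+ stream, err := reedsolomon.NewStream(10, 3)
+ if err != nil {
+  panic(err)
+ }
+
+ // 10 equally sized data streams.
+ inputs := make([]io.Reader, 10)
+ for i := range inputs {
+  inputs[i] = bytes.NewReader(make([]byte, 50000))
+ }
+
+ // 3 parity output streams.
+ parity := make([]io.Writer, 3)
+ for i := range parity {
+  parity[i] = &bytes.Buffer{}
+ }
+
+ // Stream the parity calculation from the readers to the writers.
+ err = stream.Encode(inputs, parity)
+```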
+
+For complete examples of a streaming encoder and decoder see the
+[examples folder](https://github.com/klauspost/reedsolomon/tree/master/examples).
+
+GF16 (more than 256 shards) is not supported by the streaming interface.
+
+# Advanced Options
+
+You can modify internal options which affect how jobs are split between and processed by goroutines.
+
+To create options, use the WithXXX functions. You can supply options to `New` and `NewStream`.
+If no options are supplied, default options are used.
+
+Example of how to supply options:
+
+```Go
+ enc, err := reedsolomon.New(10, 3, reedsolomon.WithMaxGoroutines(25))
+```
+
+# Leopard Compatible GF16
+
+When you encode more than 256 shards the library will switch to a [Leopard-RS](https://github.com/catid/leopard) implementation.
+
+This allows encoding up to 65536 shards (data+parity) with the following limitations, similar to leopard:
+
+* The original and recovery data must not exceed 65536 pieces.
+* Each shard size *must* be a multiple of 64 bytes.
+* Each buffer should have the same number of bytes.
+* Even the last shard must be rounded up to the block size.
+
+| | Regular | Leopard |
+|-----------------|---------|---------|
+| Encode | ✓ | ✓ |
+| EncodeIdx | ✓ | - |
+| Verify | ✓ | ✓ |
+| Reconstruct | ✓ | ✓ |
+| ReconstructData | ✓ | ✓ |
+| ReconstructSome | ✓ | ✓ (+) |
+| Update | ✓ | - |
+| Split | ✓ | ✓ |
+| Join | ✓ | ✓ |
+
+* (+) Same as calling `ReconstructData`.
+
+The Split/Join functions will help to split an input to the proper sizes.
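+
+For example, a minimal sketch of a GF16 set; the 600+400 distribution and the 64 byte shard size are only assumptions for illustration:
+
+```Go
+ // More than 256 total shards selects the Leopard GF16 code path.
+ enc, err := reedsolomon.New(600, 400)
+ if err != nil {
+  panic(err)
+ }
+
+ // Each shard must be a multiple of 64 bytes; AllocAligned also takes care of alignment.
+ shards := reedsolomon.AllocAligned(1000, 64)
+ err = enc.Encode(shards)
+```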
+
+Speed can be expected to be `O(N*log(N))`, compared to the `O(N*N)` of the regular code.
+Reconstruction matrix calculation is more time-consuming,
+so be sure to include that as part of any benchmark you run.
+
+For now SSSE3, AVX2 and AVX512 assembly are available on AMD64 platforms.
+
+Leopard mode currently always runs as a single goroutine, since multiple
+goroutines don't provide any worthwhile speedup.
+
+## Leopard GF8
+
+It is possible to replace the default reed-solomon encoder with a leopard compatible one.
+This will typically be faster when dealing with more than 20-30 shards.
+Note that the limitations listed above also apply to this mode.
+See table below for speed with different number of shards.
+
+To enable Leopard GF8 mode use `WithLeopardGF(true)`.
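+
+For example, a minimal sketch (the 32+32 distribution is only an assumed choice):
+
+```Go
+ // Use the Leopard GF8 code path even with fewer than 256 shards.
+ enc, err := reedsolomon.New(32, 32, reedsolomon.WithLeopardGF(true))
+```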
+
+Benchmark Encoding and Reconstructing *1KB* shards with variable number of shards.
+All implementations use an inversion cache when available.
+Speed is total shard size for each operation. Data shard throughput is speed/2.
+AVX2 is used.
+
+| Encoder | Shards | Encode | Recover All | Recover One |
+|--------------|-------------|----------------|--------------|----------------|
+| Cauchy | 4+4 | 23076.83 MB/s | 5444.02 MB/s | 10834.67 MB/s |
+| Cauchy | 8+8 | 15206.87 MB/s | 4223.42 MB/s | 16181.62 MB/s |
+| Cauchy | 16+16 | 7427.47 MB/s | 3305.84 MB/s | 22480.41 MB/s |
+| Cauchy | 32+32 | 3785.64 MB/s | 2300.07 MB/s | 26181.31 MB/s |
+| Cauchy | 64+64 | 1911.93 MB/s | 1368.51 MB/s | 27992.93 MB/s |
+| Cauchy | 128+128 | 963.83 MB/s | 1327.56 MB/s | 32866.86 MB/s |
+| Leopard GF8 | 4+4 | 17061.28 MB/s | 3099.06 MB/s | 4096.78 MB/s |
+| Leopard GF8 | 8+8 | 10546.67 MB/s | 2925.92 MB/s | 3964.00 MB/s |
+| Leopard GF8 | 16+16 | 10961.37 MB/s | 2328.40 MB/s | 3110.22 MB/s |
+| Leopard GF8 | 32+32 | 7111.47 MB/s | 2374.61 MB/s | 3220.75 MB/s |
+| Leopard GF8 | 64+64 | 7468.57 MB/s | 2055.41 MB/s | 3061.81 MB/s |
+| Leopard GF8 | 128+128 | 5479.99 MB/s | 1953.21 MB/s | 2815.15 MB/s |
+| Leopard GF16 | 256+256 | 6158.66 MB/s | 454.14 MB/s | 506.70 MB/s |
+| Leopard GF16 | 512+512 | 4418.58 MB/s | 685.75 MB/s | 801.63 MB/s |
+| Leopard GF16 | 1024+1024 | 4778.05 MB/s | 814.51 MB/s | 1080.19 MB/s |
+| Leopard GF16 | 2048+2048 | 3417.05 MB/s | 911.64 MB/s | 1179.48 MB/s |
+| Leopard GF16 | 4096+4096 | 3209.41 MB/s | 729.13 MB/s | 1135.06 MB/s |
+| Leopard GF16 | 8192+8192 | 2034.11 MB/s | 604.52 MB/s | 842.13 MB/s |
+| Leopard GF16 | 16384+16384 | 1525.88 MB/s | 486.74 MB/s | 750.01 MB/s |
+| Leopard GF16 | 32768+32768 | 1138.67 MB/s | 482.81 MB/s | 712.73 MB/s |
+
+"Traditional" encoding is faster until somewhere between 16 and 32 shards.
+Leopard provides fast encoding in all cases, but shows a significant overhead for reconstruction.
+
+Calculating the reconstruction matrix takes a significant amount of computation.
+With bigger shards that cost is relatively smaller; arguably, fewer shards typically also means bigger shards.
+Due to the high shard count, caching reconstruction matrices generally isn't feasible for Leopard.
+
+# Performance
+
+Performance depends mainly on the number of parity shards.
+In rough terms, doubling the number of parity shards will double the encoding time.
+
+Here are the throughput numbers with some different selections of data and parity shards.
+For reference each shard is 1MB random data, and 16 CPU cores are used for encoding.
+
+| Data | Parity | Go MB/s | SSSE3 MB/s | AVX2 MB/s |
+|------|--------|---------|------------|-----------|
+| 5 | 2 | 20,772 | 66,355 | 108,755 |
+| 8 | 8 | 6,815 | 38,338 | 70,516 |
+| 10 | 4 | 9,245 | 48,237 | 93,875 |
+| 50 | 20 | 2,063 | 12,130 | 22,828 |
+
+The throughput numbers here are the size of the encoded data and parity shards.
+
+If `runtime.GOMAXPROCS()` is set to a value higher than 1,
+the encoder will use multiple goroutines to perform the calculations in `Verify`, `Encode` and `Reconstruct`.
+
+
+Benchmarking `Reconstruct()` followed by a `Verify()` (=`all`) versus just calling `ReconstructData()` (=`data`) gives the following result:
+```
+benchmark all MB/s data MB/s speedup
+BenchmarkReconstruct10x2x10000-8 2011.67 10530.10 5.23x
+BenchmarkReconstruct50x5x50000-8 4585.41 14301.60 3.12x
+BenchmarkReconstruct10x2x1M-8 8081.15 28216.41 3.49x
+BenchmarkReconstruct5x2x1M-8 5780.07 28015.37 4.85x
+BenchmarkReconstruct10x4x1M-8 4352.56 14367.61 3.30x
+BenchmarkReconstruct50x20x1M-8 1364.35 4189.79 3.07x
+BenchmarkReconstruct10x4x16M-8 1484.35 5779.53 3.89x
+```
+
+The package will use [GFNI](https://en.wikipedia.org/wiki/AVX-512#GFNI) instructions combined with AVX512 when these are available.
+This further improves speed by up to 3x over AVX2 code paths.
+
+## ARM64 NEON
+
+By exploiting NEON instructions the performance for ARM has been accelerated.
+Below are the performance numbers for a single core on an EC2 m6g.16xlarge (Graviton2) instance (Amazon Linux 2):
+
+```
+BenchmarkGalois128K-64 119562 10028 ns/op 13070.78 MB/s
+BenchmarkGalois1M-64 14380 83424 ns/op 12569.22 MB/s
+BenchmarkGaloisXor128K-64 96508 12432 ns/op 10543.29 MB/s
+BenchmarkGaloisXor1M-64 10000 100322 ns/op 10452.13 MB/s
+```
+
+# Performance on ppc64le
+
+The performance for ppc64le has been accelerated.
+This gives roughly a 10x performance improvement on this architecture as can be seen below:
+
+```
+benchmark old MB/s new MB/s speedup
+BenchmarkGalois128K-160 948.87 8878.85 9.36x
+BenchmarkGalois1M-160 968.85 9041.92 9.33x
+BenchmarkGaloisXor128K-160 862.02 7905.00 9.17x
+BenchmarkGaloisXor1M-160 784.60 6296.65 8.03x
+```
+
+# Legal
+
+> None of the section below is legal advice. Seek your own legal counsel.
+> As stated by the [LICENSE](LICENSE) the authors will not be held liable for any use of this library.
+> Users are encouraged to independently verify they comply with all legal requirements.
+
+As can be seen in [recent news](https://www.datanami.com/2023/10/16/cloudera-hit-with-240-million-judgement-over-erasure-coding/)
+there have been lawsuits related to possible patents on aspects of erasure coding functionality.
+
+As a possible mitigation, you can use the tag `nopshufb` when compiling any code which includes this package.
+This will remove all inclusion and use of `PSHUFB` and equivalent on other platforms.
+
+This is done by adding `-tags=nopshufb` to `go build` and similar commands that produce binary output.
+
+The removed code may not be infringing and even after `-tags=nopshufb` there may still be infringing code left.
+
+# Links
+* [Backblaze Open Sources Reed-Solomon Erasure Coding Source Code](https://www.backblaze.com/blog/reed-solomon/).
+* [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon). Compatible java library by Backblaze.
+* [ocaml-reed-solomon-erasure](https://gitlab.com/darrenldl/ocaml-reed-solomon-erasure). Compatible OCaml implementation.
+* [reedsolomon-c](https://github.com/jannson/reedsolomon-c). C version, compatible with output from this package.
+* [Reed-Solomon Erasure Coding in Haskell](https://github.com/NicolasT/reedsolomon). Haskell port of the package with similar performance.
+* [reed-solomon-erasure](https://github.com/darrenldl/reed-solomon-erasure). Compatible Rust implementation.
+* [go-erasure](https://github.com/somethingnew2-0/go-erasure). A similar library using cgo, slower in my tests.
+* [Screaming Fast Galois Field Arithmetic](http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf). Basis for SSE3 optimizations.
+* [Leopard-RS](https://github.com/catid/leopard) C library used as basis for GF16 implementation.
+
+# License
+
+This code, as the original [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon) is published under an MIT license. See LICENSE file for more information.
diff --git a/vendor/github.com/klauspost/reedsolomon/galois.go b/vendor/github.com/klauspost/reedsolomon/galois.go
new file mode 100644
index 000000000..697f9ca67
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois.go
@@ -0,0 +1,979 @@
+/**
+ * 8-bit Galois Field
+ * Copyright 2015, Klaus Post
+ * Copyright 2015, Backblaze, Inc. All rights reserved.
+ */
+
+package reedsolomon
+
+import (
+ "encoding/binary"
+)
+
+const (
+ // The number of elements in the field.
+ fieldSize = 256
+
+ // The polynomial used to generate the logarithm table.
+ //
+ // There are a number of polynomials that work to generate
+ // a Galois field of 256 elements. The choice is arbitrary,
+ // and we just use the first one.
+ //
+ // The possibilities are: 29, 43, 45, 77, 95, 99, 101, 105,
+ // 113, 135, 141, 169, 195, 207, 231, and 245.
+ generatingPolynomial = 29
+)
+
+var logTable = [fieldSize]byte{
+ 0, 0, 1, 25, 2, 50, 26, 198,
+ 3, 223, 51, 238, 27, 104, 199, 75,
+ 4, 100, 224, 14, 52, 141, 239, 129,
+ 28, 193, 105, 248, 200, 8, 76, 113,
+ 5, 138, 101, 47, 225, 36, 15, 33,
+ 53, 147, 142, 218, 240, 18, 130, 69,
+ 29, 181, 194, 125, 106, 39, 249, 185,
+ 201, 154, 9, 120, 77, 228, 114, 166,
+ 6, 191, 139, 98, 102, 221, 48, 253,
+ 226, 152, 37, 179, 16, 145, 34, 136,
+ 54, 208, 148, 206, 143, 150, 219, 189,
+ 241, 210, 19, 92, 131, 56, 70, 64,
+ 30, 66, 182, 163, 195, 72, 126, 110,
+ 107, 58, 40, 84, 250, 133, 186, 61,
+ 202, 94, 155, 159, 10, 21, 121, 43,
+ 78, 212, 229, 172, 115, 243, 167, 87,
+ 7, 112, 192, 247, 140, 128, 99, 13,
+ 103, 74, 222, 237, 49, 197, 254, 24,
+ 227, 165, 153, 119, 38, 184, 180, 124,
+ 17, 68, 146, 217, 35, 32, 137, 46,
+ 55, 63, 209, 91, 149, 188, 207, 205,
+ 144, 135, 151, 178, 220, 252, 190, 97,
+ 242, 86, 211, 171, 20, 42, 93, 158,
+ 132, 60, 57, 83, 71, 109, 65, 162,
+ 31, 45, 67, 216, 183, 123, 164, 118,
+ 196, 23, 73, 236, 127, 12, 111, 246,
+ 108, 161, 59, 82, 41, 157, 85, 170,
+ 251, 96, 134, 177, 187, 204, 62, 90,
+ 203, 89, 95, 176, 156, 169, 160, 81,
+ 11, 245, 22, 235, 122, 117, 44, 215,
+ 79, 174, 213, 233, 230, 231, 173, 232,
+ 116, 214, 244, 234, 168, 80, 88, 175,
+}
+
+/**
+ * Inverse of the logarithm table. Maps integer logarithms
+ * to members of the field. Entry 255 is the same as entry 0 due to mod 255.
+ *
+ * This table was generated by `go run gentables.go`
+ * Table has been truncated to 256 bytes, since no lookups are bigger.
+ */
+var expTable = [256]byte{0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 0x8f, 0x3, 0x6, 0xc, 0x18, 0x30, 0x60, 0xc0, 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 0x46, 0x8c, 0x5, 0xa, 0x14, 0x28, 0x50, 0xa0, 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 0x65, 0xca, 0x89, 0xf, 0x1e, 0x3c, 0x78, 0xf0, 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 0xd, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x7, 0xe, 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x9, 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0xb, 0x16, 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x1}
+
+func galAdd(a, b byte) byte {
+ return a ^ b
+}
+
+// Table from https://github.com/templexxx/reedsolomon
+var invTable = [256]byte{0x0, 0x1, 0x8e, 0xf4, 0x47, 0xa7, 0x7a, 0xba, 0xad, 0x9d, 0xdd, 0x98, 0x3d, 0xaa, 0x5d, 0x96, 0xd8, 0x72, 0xc0, 0x58, 0xe0, 0x3e, 0x4c, 0x66, 0x90, 0xde, 0x55, 0x80, 0xa0, 0x83, 0x4b, 0x2a, 0x6c, 0xed, 0x39, 0x51, 0x60, 0x56, 0x2c, 0x8a, 0x70, 0xd0, 0x1f, 0x4a, 0x26, 0x8b, 0x33, 0x6e, 0x48, 0x89, 0x6f, 0x2e, 0xa4, 0xc3, 0x40, 0x5e, 0x50, 0x22, 0xcf, 0xa9, 0xab, 0xc, 0x15, 0xe1, 0x36, 0x5f, 0xf8, 0xd5, 0x92, 0x4e, 0xa6, 0x4, 0x30, 0x88, 0x2b, 0x1e, 0x16, 0x67, 0x45, 0x93, 0x38, 0x23, 0x68, 0x8c, 0x81, 0x1a, 0x25, 0x61, 0x13, 0xc1, 0xcb, 0x63, 0x97, 0xe, 0x37, 0x41, 0x24, 0x57, 0xca, 0x5b, 0xb9, 0xc4, 0x17, 0x4d, 0x52, 0x8d, 0xef, 0xb3, 0x20, 0xec, 0x2f, 0x32, 0x28, 0xd1, 0x11, 0xd9, 0xe9, 0xfb, 0xda, 0x79, 0xdb, 0x77, 0x6, 0xbb, 0x84, 0xcd, 0xfe, 0xfc, 0x1b, 0x54, 0xa1, 0x1d, 0x7c, 0xcc, 0xe4, 0xb0, 0x49, 0x31, 0x27, 0x2d, 0x53, 0x69, 0x2, 0xf5, 0x18, 0xdf, 0x44, 0x4f, 0x9b, 0xbc, 0xf, 0x5c, 0xb, 0xdc, 0xbd, 0x94, 0xac, 0x9, 0xc7, 0xa2, 0x1c, 0x82, 0x9f, 0xc6, 0x34, 0xc2, 0x46, 0x5, 0xce, 0x3b, 0xd, 0x3c, 0x9c, 0x8, 0xbe, 0xb7, 0x87, 0xe5, 0xee, 0x6b, 0xeb, 0xf2, 0xbf, 0xaf, 0xc5, 0x64, 0x7, 0x7b, 0x95, 0x9a, 0xae, 0xb6, 0x12, 0x59, 0xa5, 0x35, 0x65, 0xb8, 0xa3, 0x9e, 0xd2, 0xf7, 0x62, 0x5a, 0x85, 0x7d, 0xa8, 0x3a, 0x29, 0x71, 0xc8, 0xf6, 0xf9, 0x43, 0xd7, 0xd6, 0x10, 0x73, 0x76, 0x78, 0x99, 0xa, 0x19, 0x91, 0x14, 0x3f, 0xe6, 0xf0, 0x86, 0xb1, 0xe2, 0xf1, 0xfa, 0x74, 0xf3, 0xb4, 0x6d, 0x21, 0xb2, 0x6a, 0xe3, 0xe7, 0xb5, 0xea, 0x3, 0x8f, 0xd3, 0xc9, 0x42, 0xd4, 0xe8, 0x75, 0x7f, 0xff, 0x7e, 0xfd}
+
+var mulTable = [256][256]uint8{{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff},
+ {0x0, 0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e, 0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e, 0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e, 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e, 0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae, 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe, 0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde, 0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee, 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe, 0x1d, 0x1f, 0x19, 0x1b, 0x15, 0x17, 0x11, 0x13, 0xd, 0xf, 0x9, 0xb, 0x5, 0x7, 0x1, 0x3, 0x3d, 0x3f, 0x39, 0x3b, 0x35, 0x37, 0x31, 0x33, 0x2d, 0x2f, 0x29, 0x2b, 0x25, 0x27, 0x21, 0x23, 0x5d, 0x5f, 0x59, 0x5b, 0x55, 0x57, 0x51, 0x53, 0x4d, 0x4f, 0x49, 0x4b, 0x45, 0x47, 0x41, 0x43, 0x7d, 0x7f, 0x79, 0x7b, 0x75, 0x77, 0x71, 0x73, 0x6d, 0x6f, 0x69, 0x6b, 0x65, 0x67, 0x61, 0x63, 0x9d, 0x9f, 0x99, 0x9b, 0x95, 0x97, 0x91, 0x93, 0x8d, 0x8f, 0x89, 0x8b, 0x85, 0x87, 0x81, 0x83, 0xbd, 0xbf, 0xb9, 0xbb, 0xb5, 0xb7, 0xb1, 0xb3, 0xad, 0xaf, 0xa9, 0xab, 0xa5, 0xa7, 0xa1, 0xa3, 0xdd, 0xdf, 0xd9, 0xdb, 0xd5, 0xd7, 0xd1, 0xd3, 0xcd, 0xcf, 0xc9, 0xcb, 0xc5, 0xc7, 0xc1, 0xc3, 0xfd, 0xff, 0xf9, 0xfb, 0xf5, 0xf7, 0xf1, 0xf3, 0xed, 0xef, 0xe9, 0xeb, 0xe5, 0xe7, 0xe1, 0xe3},
+ {0x0, 0x3, 0x6, 0x5, 0xc, 0xf, 0xa, 0x9, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11, 0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21, 0x60, 0x63, 0x66, 0x65, 0x6c, 0x6f, 0x6a, 0x69, 0x78, 0x7b, 0x7e, 0x7d, 0x74, 0x77, 0x72, 0x71, 0x50, 0x53, 0x56, 0x55, 0x5c, 0x5f, 0x5a, 0x59, 0x48, 0x4b, 0x4e, 0x4d, 0x44, 0x47, 0x42, 0x41, 0xc0, 0xc3, 0xc6, 0xc5, 0xcc, 0xcf, 0xca, 0xc9, 0xd8, 0xdb, 0xde, 0xdd, 0xd4, 0xd7, 0xd2, 0xd1, 0xf0, 0xf3, 0xf6, 0xf5, 0xfc, 0xff, 0xfa, 0xf9, 0xe8, 0xeb, 0xee, 0xed, 0xe4, 0xe7, 0xe2, 0xe1, 0xa0, 0xa3, 0xa6, 0xa5, 0xac, 0xaf, 0xaa, 0xa9, 0xb8, 0xbb, 0xbe, 0xbd, 0xb4, 0xb7, 0xb2, 0xb1, 0x90, 0x93, 0x96, 0x95, 0x9c, 0x9f, 0x9a, 0x99, 0x88, 0x8b, 0x8e, 0x8d, 0x84, 0x87, 0x82, 0x81, 0x9d, 0x9e, 0x9b, 0x98, 0x91, 0x92, 0x97, 0x94, 0x85, 0x86, 0x83, 0x80, 0x89, 0x8a, 0x8f, 0x8c, 0xad, 0xae, 0xab, 0xa8, 0xa1, 0xa2, 0xa7, 0xa4, 0xb5, 0xb6, 0xb3, 0xb0, 0xb9, 0xba, 0xbf, 0xbc, 0xfd, 0xfe, 0xfb, 0xf8, 0xf1, 0xf2, 0xf7, 0xf4, 0xe5, 0xe6, 0xe3, 0xe0, 0xe9, 0xea, 0xef, 0xec, 0xcd, 0xce, 0xcb, 0xc8, 0xc1, 0xc2, 0xc7, 0xc4, 0xd5, 0xd6, 0xd3, 0xd0, 0xd9, 0xda, 0xdf, 0xdc, 0x5d, 0x5e, 0x5b, 0x58, 0x51, 0x52, 0x57, 0x54, 0x45, 0x46, 0x43, 0x40, 0x49, 0x4a, 0x4f, 0x4c, 0x6d, 0x6e, 0x6b, 0x68, 0x61, 0x62, 0x67, 0x64, 0x75, 0x76, 0x73, 0x70, 0x79, 0x7a, 0x7f, 0x7c, 0x3d, 0x3e, 0x3b, 0x38, 0x31, 0x32, 0x37, 0x34, 0x25, 0x26, 0x23, 0x20, 0x29, 0x2a, 0x2f, 0x2c, 0xd, 0xe, 0xb, 0x8, 0x1, 0x2, 0x7, 0x4, 0x15, 0x16, 0x13, 0x10, 0x19, 0x1a, 0x1f, 0x1c},
+ {0x0, 0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c, 0x40, 0x44, 0x48, 0x4c, 0x50, 0x54, 0x58, 0x5c, 0x60, 0x64, 0x68, 0x6c, 0x70, 0x74, 0x78, 0x7c, 0x80, 0x84, 0x88, 0x8c, 0x90, 0x94, 0x98, 0x9c, 0xa0, 0xa4, 0xa8, 0xac, 0xb0, 0xb4, 0xb8, 0xbc, 0xc0, 0xc4, 0xc8, 0xcc, 0xd0, 0xd4, 0xd8, 0xdc, 0xe0, 0xe4, 0xe8, 0xec, 0xf0, 0xf4, 0xf8, 0xfc, 0x1d, 0x19, 0x15, 0x11, 0xd, 0x9, 0x5, 0x1, 0x3d, 0x39, 0x35, 0x31, 0x2d, 0x29, 0x25, 0x21, 0x5d, 0x59, 0x55, 0x51, 0x4d, 0x49, 0x45, 0x41, 0x7d, 0x79, 0x75, 0x71, 0x6d, 0x69, 0x65, 0x61, 0x9d, 0x99, 0x95, 0x91, 0x8d, 0x89, 0x85, 0x81, 0xbd, 0xb9, 0xb5, 0xb1, 0xad, 0xa9, 0xa5, 0xa1, 0xdd, 0xd9, 0xd5, 0xd1, 0xcd, 0xc9, 0xc5, 0xc1, 0xfd, 0xf9, 0xf5, 0xf1, 0xed, 0xe9, 0xe5, 0xe1, 0x3a, 0x3e, 0x32, 0x36, 0x2a, 0x2e, 0x22, 0x26, 0x1a, 0x1e, 0x12, 0x16, 0xa, 0xe, 0x2, 0x6, 0x7a, 0x7e, 0x72, 0x76, 0x6a, 0x6e, 0x62, 0x66, 0x5a, 0x5e, 0x52, 0x56, 0x4a, 0x4e, 0x42, 0x46, 0xba, 0xbe, 0xb2, 0xb6, 0xaa, 0xae, 0xa2, 0xa6, 0x9a, 0x9e, 0x92, 0x96, 0x8a, 0x8e, 0x82, 0x86, 0xfa, 0xfe, 0xf2, 0xf6, 0xea, 0xee, 0xe2, 0xe6, 0xda, 0xde, 0xd2, 0xd6, 0xca, 0xce, 0xc2, 0xc6, 0x27, 0x23, 0x2f, 0x2b, 0x37, 0x33, 0x3f, 0x3b, 0x7, 0x3, 0xf, 0xb, 0x17, 0x13, 0x1f, 0x1b, 0x67, 0x63, 0x6f, 0x6b, 0x77, 0x73, 0x7f, 0x7b, 0x47, 0x43, 0x4f, 0x4b, 0x57, 0x53, 0x5f, 0x5b, 0xa7, 0xa3, 0xaf, 0xab, 0xb7, 0xb3, 0xbf, 0xbb, 0x87, 0x83, 0x8f, 0x8b, 0x97, 0x93, 0x9f, 0x9b, 0xe7, 0xe3, 0xef, 0xeb, 0xf7, 0xf3, 0xff, 0xfb, 0xc7, 0xc3, 0xcf, 0xcb, 0xd7, 0xd3, 0xdf, 0xdb},
+ {0x0, 0x5, 0xa, 0xf, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33, 0x50, 0x55, 0x5a, 0x5f, 0x44, 0x41, 0x4e, 0x4b, 0x78, 0x7d, 0x72, 0x77, 0x6c, 0x69, 0x66, 0x63, 0xa0, 0xa5, 0xaa, 0xaf, 0xb4, 0xb1, 0xbe, 0xbb, 0x88, 0x8d, 0x82, 0x87, 0x9c, 0x99, 0x96, 0x93, 0xf0, 0xf5, 0xfa, 0xff, 0xe4, 0xe1, 0xee, 0xeb, 0xd8, 0xdd, 0xd2, 0xd7, 0xcc, 0xc9, 0xc6, 0xc3, 0x5d, 0x58, 0x57, 0x52, 0x49, 0x4c, 0x43, 0x46, 0x75, 0x70, 0x7f, 0x7a, 0x61, 0x64, 0x6b, 0x6e, 0xd, 0x8, 0x7, 0x2, 0x19, 0x1c, 0x13, 0x16, 0x25, 0x20, 0x2f, 0x2a, 0x31, 0x34, 0x3b, 0x3e, 0xfd, 0xf8, 0xf7, 0xf2, 0xe9, 0xec, 0xe3, 0xe6, 0xd5, 0xd0, 0xdf, 0xda, 0xc1, 0xc4, 0xcb, 0xce, 0xad, 0xa8, 0xa7, 0xa2, 0xb9, 0xbc, 0xb3, 0xb6, 0x85, 0x80, 0x8f, 0x8a, 0x91, 0x94, 0x9b, 0x9e, 0xba, 0xbf, 0xb0, 0xb5, 0xae, 0xab, 0xa4, 0xa1, 0x92, 0x97, 0x98, 0x9d, 0x86, 0x83, 0x8c, 0x89, 0xea, 0xef, 0xe0, 0xe5, 0xfe, 0xfb, 0xf4, 0xf1, 0xc2, 0xc7, 0xc8, 0xcd, 0xd6, 0xd3, 0xdc, 0xd9, 0x1a, 0x1f, 0x10, 0x15, 0xe, 0xb, 0x4, 0x1, 0x32, 0x37, 0x38, 0x3d, 0x26, 0x23, 0x2c, 0x29, 0x4a, 0x4f, 0x40, 0x45, 0x5e, 0x5b, 0x54, 0x51, 0x62, 0x67, 0x68, 0x6d, 0x76, 0x73, 0x7c, 0x79, 0xe7, 0xe2, 0xed, 0xe8, 0xf3, 0xf6, 0xf9, 0xfc, 0xcf, 0xca, 0xc5, 0xc0, 0xdb, 0xde, 0xd1, 0xd4, 0xb7, 0xb2, 0xbd, 0xb8, 0xa3, 0xa6, 0xa9, 0xac, 0x9f, 0x9a, 0x95, 0x90, 0x8b, 0x8e, 0x81, 0x84, 0x47, 0x42, 0x4d, 0x48, 0x53, 0x56, 0x59, 0x5c, 0x6f, 0x6a, 0x65, 0x60, 0x7b, 0x7e, 0x71, 0x74, 0x17, 0x12, 0x1d, 0x18, 0x3, 0x6, 0x9, 0xc, 0x3f, 0x3a, 0x35, 0x30, 0x2b, 0x2e, 0x21, 0x24},
+ {0x0, 0x6, 0xc, 0xa, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22, 0x60, 0x66, 0x6c, 0x6a, 0x78, 0x7e, 0x74, 0x72, 0x50, 0x56, 0x5c, 0x5a, 0x48, 0x4e, 0x44, 0x42, 0xc0, 0xc6, 0xcc, 0xca, 0xd8, 0xde, 0xd4, 0xd2, 0xf0, 0xf6, 0xfc, 0xfa, 0xe8, 0xee, 0xe4, 0xe2, 0xa0, 0xa6, 0xac, 0xaa, 0xb8, 0xbe, 0xb4, 0xb2, 0x90, 0x96, 0x9c, 0x9a, 0x88, 0x8e, 0x84, 0x82, 0x9d, 0x9b, 0x91, 0x97, 0x85, 0x83, 0x89, 0x8f, 0xad, 0xab, 0xa1, 0xa7, 0xb5, 0xb3, 0xb9, 0xbf, 0xfd, 0xfb, 0xf1, 0xf7, 0xe5, 0xe3, 0xe9, 0xef, 0xcd, 0xcb, 0xc1, 0xc7, 0xd5, 0xd3, 0xd9, 0xdf, 0x5d, 0x5b, 0x51, 0x57, 0x45, 0x43, 0x49, 0x4f, 0x6d, 0x6b, 0x61, 0x67, 0x75, 0x73, 0x79, 0x7f, 0x3d, 0x3b, 0x31, 0x37, 0x25, 0x23, 0x29, 0x2f, 0xd, 0xb, 0x1, 0x7, 0x15, 0x13, 0x19, 0x1f, 0x27, 0x21, 0x2b, 0x2d, 0x3f, 0x39, 0x33, 0x35, 0x17, 0x11, 0x1b, 0x1d, 0xf, 0x9, 0x3, 0x5, 0x47, 0x41, 0x4b, 0x4d, 0x5f, 0x59, 0x53, 0x55, 0x77, 0x71, 0x7b, 0x7d, 0x6f, 0x69, 0x63, 0x65, 0xe7, 0xe1, 0xeb, 0xed, 0xff, 0xf9, 0xf3, 0xf5, 0xd7, 0xd1, 0xdb, 0xdd, 0xcf, 0xc9, 0xc3, 0xc5, 0x87, 0x81, 0x8b, 0x8d, 0x9f, 0x99, 0x93, 0x95, 0xb7, 0xb1, 0xbb, 0xbd, 0xaf, 0xa9, 0xa3, 0xa5, 0xba, 0xbc, 0xb6, 0xb0, 0xa2, 0xa4, 0xae, 0xa8, 0x8a, 0x8c, 0x86, 0x80, 0x92, 0x94, 0x9e, 0x98, 0xda, 0xdc, 0xd6, 0xd0, 0xc2, 0xc4, 0xce, 0xc8, 0xea, 0xec, 0xe6, 0xe0, 0xf2, 0xf4, 0xfe, 0xf8, 0x7a, 0x7c, 0x76, 0x70, 0x62, 0x64, 0x6e, 0x68, 0x4a, 0x4c, 0x46, 0x40, 0x52, 0x54, 0x5e, 0x58, 0x1a, 0x1c, 0x16, 0x10, 0x2, 0x4, 0xe, 0x8, 0x2a, 0x2c, 0x26, 0x20, 0x32, 0x34, 0x3e, 0x38},
+ {0x0, 0x7, 0xe, 0x9, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d, 0x70, 0x77, 0x7e, 0x79, 0x6c, 0x6b, 0x62, 0x65, 0x48, 0x4f, 0x46, 0x41, 0x54, 0x53, 0x5a, 0x5d, 0xe0, 0xe7, 0xee, 0xe9, 0xfc, 0xfb, 0xf2, 0xf5, 0xd8, 0xdf, 0xd6, 0xd1, 0xc4, 0xc3, 0xca, 0xcd, 0x90, 0x97, 0x9e, 0x99, 0x8c, 0x8b, 0x82, 0x85, 0xa8, 0xaf, 0xa6, 0xa1, 0xb4, 0xb3, 0xba, 0xbd, 0xdd, 0xda, 0xd3, 0xd4, 0xc1, 0xc6, 0xcf, 0xc8, 0xe5, 0xe2, 0xeb, 0xec, 0xf9, 0xfe, 0xf7, 0xf0, 0xad, 0xaa, 0xa3, 0xa4, 0xb1, 0xb6, 0xbf, 0xb8, 0x95, 0x92, 0x9b, 0x9c, 0x89, 0x8e, 0x87, 0x80, 0x3d, 0x3a, 0x33, 0x34, 0x21, 0x26, 0x2f, 0x28, 0x5, 0x2, 0xb, 0xc, 0x19, 0x1e, 0x17, 0x10, 0x4d, 0x4a, 0x43, 0x44, 0x51, 0x56, 0x5f, 0x58, 0x75, 0x72, 0x7b, 0x7c, 0x69, 0x6e, 0x67, 0x60, 0xa7, 0xa0, 0xa9, 0xae, 0xbb, 0xbc, 0xb5, 0xb2, 0x9f, 0x98, 0x91, 0x96, 0x83, 0x84, 0x8d, 0x8a, 0xd7, 0xd0, 0xd9, 0xde, 0xcb, 0xcc, 0xc5, 0xc2, 0xef, 0xe8, 0xe1, 0xe6, 0xf3, 0xf4, 0xfd, 0xfa, 0x47, 0x40, 0x49, 0x4e, 0x5b, 0x5c, 0x55, 0x52, 0x7f, 0x78, 0x71, 0x76, 0x63, 0x64, 0x6d, 0x6a, 0x37, 0x30, 0x39, 0x3e, 0x2b, 0x2c, 0x25, 0x22, 0xf, 0x8, 0x1, 0x6, 0x13, 0x14, 0x1d, 0x1a, 0x7a, 0x7d, 0x74, 0x73, 0x66, 0x61, 0x68, 0x6f, 0x42, 0x45, 0x4c, 0x4b, 0x5e, 0x59, 0x50, 0x57, 0xa, 0xd, 0x4, 0x3, 0x16, 0x11, 0x18, 0x1f, 0x32, 0x35, 0x3c, 0x3b, 0x2e, 0x29, 0x20, 0x27, 0x9a, 0x9d, 0x94, 0x93, 0x86, 0x81, 0x88, 0x8f, 0xa2, 0xa5, 0xac, 0xab, 0xbe, 0xb9, 0xb0, 0xb7, 0xea, 0xed, 0xe4, 0xe3, 0xf6, 0xf1, 0xf8, 0xff, 0xd2, 0xd5, 0xdc, 0xdb, 0xce, 0xc9, 0xc0, 0xc7},
+ {0x0, 0x8, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78, 0x80, 0x88, 0x90, 0x98, 0xa0, 0xa8, 0xb0, 0xb8, 0xc0, 0xc8, 0xd0, 0xd8, 0xe0, 0xe8, 0xf0, 0xf8, 0x1d, 0x15, 0xd, 0x5, 0x3d, 0x35, 0x2d, 0x25, 0x5d, 0x55, 0x4d, 0x45, 0x7d, 0x75, 0x6d, 0x65, 0x9d, 0x95, 0x8d, 0x85, 0xbd, 0xb5, 0xad, 0xa5, 0xdd, 0xd5, 0xcd, 0xc5, 0xfd, 0xf5, 0xed, 0xe5, 0x3a, 0x32, 0x2a, 0x22, 0x1a, 0x12, 0xa, 0x2, 0x7a, 0x72, 0x6a, 0x62, 0x5a, 0x52, 0x4a, 0x42, 0xba, 0xb2, 0xaa, 0xa2, 0x9a, 0x92, 0x8a, 0x82, 0xfa, 0xf2, 0xea, 0xe2, 0xda, 0xd2, 0xca, 0xc2, 0x27, 0x2f, 0x37, 0x3f, 0x7, 0xf, 0x17, 0x1f, 0x67, 0x6f, 0x77, 0x7f, 0x47, 0x4f, 0x57, 0x5f, 0xa7, 0xaf, 0xb7, 0xbf, 0x87, 0x8f, 0x97, 0x9f, 0xe7, 0xef, 0xf7, 0xff, 0xc7, 0xcf, 0xd7, 0xdf, 0x74, 0x7c, 0x64, 0x6c, 0x54, 0x5c, 0x44, 0x4c, 0x34, 0x3c, 0x24, 0x2c, 0x14, 0x1c, 0x4, 0xc, 0xf4, 0xfc, 0xe4, 0xec, 0xd4, 0xdc, 0xc4, 0xcc, 0xb4, 0xbc, 0xa4, 0xac, 0x94, 0x9c, 0x84, 0x8c, 0x69, 0x61, 0x79, 0x71, 0x49, 0x41, 0x59, 0x51, 0x29, 0x21, 0x39, 0x31, 0x9, 0x1, 0x19, 0x11, 0xe9, 0xe1, 0xf9, 0xf1, 0xc9, 0xc1, 0xd9, 0xd1, 0xa9, 0xa1, 0xb9, 0xb1, 0x89, 0x81, 0x99, 0x91, 0x4e, 0x46, 0x5e, 0x56, 0x6e, 0x66, 0x7e, 0x76, 0xe, 0x6, 0x1e, 0x16, 0x2e, 0x26, 0x3e, 0x36, 0xce, 0xc6, 0xde, 0xd6, 0xee, 0xe6, 0xfe, 0xf6, 0x8e, 0x86, 0x9e, 0x96, 0xae, 0xa6, 0xbe, 0xb6, 0x53, 0x5b, 0x43, 0x4b, 0x73, 0x7b, 0x63, 0x6b, 0x13, 0x1b, 0x3, 0xb, 0x33, 0x3b, 0x23, 0x2b, 0xd3, 0xdb, 0xc3, 0xcb, 0xf3, 0xfb, 0xe3, 0xeb, 0x93, 0x9b, 0x83, 0x8b, 0xb3, 0xbb, 0xa3, 0xab},
+ {0x0, 0x9, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77, 0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7, 0x3d, 0x34, 0x2f, 0x26, 0x19, 0x10, 0xb, 0x2, 0x75, 0x7c, 0x67, 0x6e, 0x51, 0x58, 0x43, 0x4a, 0xad, 0xa4, 0xbf, 0xb6, 0x89, 0x80, 0x9b, 0x92, 0xe5, 0xec, 0xf7, 0xfe, 0xc1, 0xc8, 0xd3, 0xda, 0x7a, 0x73, 0x68, 0x61, 0x5e, 0x57, 0x4c, 0x45, 0x32, 0x3b, 0x20, 0x29, 0x16, 0x1f, 0x4, 0xd, 0xea, 0xe3, 0xf8, 0xf1, 0xce, 0xc7, 0xdc, 0xd5, 0xa2, 0xab, 0xb0, 0xb9, 0x86, 0x8f, 0x94, 0x9d, 0x47, 0x4e, 0x55, 0x5c, 0x63, 0x6a, 0x71, 0x78, 0xf, 0x6, 0x1d, 0x14, 0x2b, 0x22, 0x39, 0x30, 0xd7, 0xde, 0xc5, 0xcc, 0xf3, 0xfa, 0xe1, 0xe8, 0x9f, 0x96, 0x8d, 0x84, 0xbb, 0xb2, 0xa9, 0xa0, 0xf4, 0xfd, 0xe6, 0xef, 0xd0, 0xd9, 0xc2, 0xcb, 0xbc, 0xb5, 0xae, 0xa7, 0x98, 0x91, 0x8a, 0x83, 0x64, 0x6d, 0x76, 0x7f, 0x40, 0x49, 0x52, 0x5b, 0x2c, 0x25, 0x3e, 0x37, 0x8, 0x1, 0x1a, 0x13, 0xc9, 0xc0, 0xdb, 0xd2, 0xed, 0xe4, 0xff, 0xf6, 0x81, 0x88, 0x93, 0x9a, 0xa5, 0xac, 0xb7, 0xbe, 0x59, 0x50, 0x4b, 0x42, 0x7d, 0x74, 0x6f, 0x66, 0x11, 0x18, 0x3, 0xa, 0x35, 0x3c, 0x27, 0x2e, 0x8e, 0x87, 0x9c, 0x95, 0xaa, 0xa3, 0xb8, 0xb1, 0xc6, 0xcf, 0xd4, 0xdd, 0xe2, 0xeb, 0xf0, 0xf9, 0x1e, 0x17, 0xc, 0x5, 0x3a, 0x33, 0x28, 0x21, 0x56, 0x5f, 0x44, 0x4d, 0x72, 0x7b, 0x60, 0x69, 0xb3, 0xba, 0xa1, 0xa8, 0x97, 0x9e, 0x85, 0x8c, 0xfb, 0xf2, 0xe9, 0xe0, 0xdf, 0xd6, 0xcd, 0xc4, 0x23, 0x2a, 0x31, 0x38, 0x7, 0xe, 0x15, 0x1c, 0x6b, 0x62, 0x79, 0x70, 0x4f, 0x46, 0x5d, 0x54},
+ {0x0, 0xa, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66, 0xa0, 0xaa, 0xb4, 0xbe, 0x88, 0x82, 0x9c, 0x96, 0xf0, 0xfa, 0xe4, 0xee, 0xd8, 0xd2, 0xcc, 0xc6, 0x5d, 0x57, 0x49, 0x43, 0x75, 0x7f, 0x61, 0x6b, 0xd, 0x7, 0x19, 0x13, 0x25, 0x2f, 0x31, 0x3b, 0xfd, 0xf7, 0xe9, 0xe3, 0xd5, 0xdf, 0xc1, 0xcb, 0xad, 0xa7, 0xb9, 0xb3, 0x85, 0x8f, 0x91, 0x9b, 0xba, 0xb0, 0xae, 0xa4, 0x92, 0x98, 0x86, 0x8c, 0xea, 0xe0, 0xfe, 0xf4, 0xc2, 0xc8, 0xd6, 0xdc, 0x1a, 0x10, 0xe, 0x4, 0x32, 0x38, 0x26, 0x2c, 0x4a, 0x40, 0x5e, 0x54, 0x62, 0x68, 0x76, 0x7c, 0xe7, 0xed, 0xf3, 0xf9, 0xcf, 0xc5, 0xdb, 0xd1, 0xb7, 0xbd, 0xa3, 0xa9, 0x9f, 0x95, 0x8b, 0x81, 0x47, 0x4d, 0x53, 0x59, 0x6f, 0x65, 0x7b, 0x71, 0x17, 0x1d, 0x3, 0x9, 0x3f, 0x35, 0x2b, 0x21, 0x69, 0x63, 0x7d, 0x77, 0x41, 0x4b, 0x55, 0x5f, 0x39, 0x33, 0x2d, 0x27, 0x11, 0x1b, 0x5, 0xf, 0xc9, 0xc3, 0xdd, 0xd7, 0xe1, 0xeb, 0xf5, 0xff, 0x99, 0x93, 0x8d, 0x87, 0xb1, 0xbb, 0xa5, 0xaf, 0x34, 0x3e, 0x20, 0x2a, 0x1c, 0x16, 0x8, 0x2, 0x64, 0x6e, 0x70, 0x7a, 0x4c, 0x46, 0x58, 0x52, 0x94, 0x9e, 0x80, 0x8a, 0xbc, 0xb6, 0xa8, 0xa2, 0xc4, 0xce, 0xd0, 0xda, 0xec, 0xe6, 0xf8, 0xf2, 0xd3, 0xd9, 0xc7, 0xcd, 0xfb, 0xf1, 0xef, 0xe5, 0x83, 0x89, 0x97, 0x9d, 0xab, 0xa1, 0xbf, 0xb5, 0x73, 0x79, 0x67, 0x6d, 0x5b, 0x51, 0x4f, 0x45, 0x23, 0x29, 0x37, 0x3d, 0xb, 0x1, 0x1f, 0x15, 0x8e, 0x84, 0x9a, 0x90, 0xa6, 0xac, 0xb2, 0xb8, 0xde, 0xd4, 0xca, 0xc0, 0xf6, 0xfc, 0xe2, 0xe8, 0x2e, 0x24, 0x3a, 0x30, 0x6, 0xc, 0x12, 0x18, 0x7e, 0x74, 0x6a, 0x60, 0x56, 0x5c, 0x42, 0x48},
+ {0x0, 0xb, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69, 0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9, 0x7d, 0x76, 0x6b, 0x60, 0x51, 0x5a, 0x47, 0x4c, 0x25, 0x2e, 0x33, 0x38, 0x9, 0x2, 0x1f, 0x14, 0xcd, 0xc6, 0xdb, 0xd0, 0xe1, 0xea, 0xf7, 0xfc, 0x95, 0x9e, 0x83, 0x88, 0xb9, 0xb2, 0xaf, 0xa4, 0xfa, 0xf1, 0xec, 0xe7, 0xd6, 0xdd, 0xc0, 0xcb, 0xa2, 0xa9, 0xb4, 0xbf, 0x8e, 0x85, 0x98, 0x93, 0x4a, 0x41, 0x5c, 0x57, 0x66, 0x6d, 0x70, 0x7b, 0x12, 0x19, 0x4, 0xf, 0x3e, 0x35, 0x28, 0x23, 0x87, 0x8c, 0x91, 0x9a, 0xab, 0xa0, 0xbd, 0xb6, 0xdf, 0xd4, 0xc9, 0xc2, 0xf3, 0xf8, 0xe5, 0xee, 0x37, 0x3c, 0x21, 0x2a, 0x1b, 0x10, 0xd, 0x6, 0x6f, 0x64, 0x79, 0x72, 0x43, 0x48, 0x55, 0x5e, 0xe9, 0xe2, 0xff, 0xf4, 0xc5, 0xce, 0xd3, 0xd8, 0xb1, 0xba, 0xa7, 0xac, 0x9d, 0x96, 0x8b, 0x80, 0x59, 0x52, 0x4f, 0x44, 0x75, 0x7e, 0x63, 0x68, 0x1, 0xa, 0x17, 0x1c, 0x2d, 0x26, 0x3b, 0x30, 0x94, 0x9f, 0x82, 0x89, 0xb8, 0xb3, 0xae, 0xa5, 0xcc, 0xc7, 0xda, 0xd1, 0xe0, 0xeb, 0xf6, 0xfd, 0x24, 0x2f, 0x32, 0x39, 0x8, 0x3, 0x1e, 0x15, 0x7c, 0x77, 0x6a, 0x61, 0x50, 0x5b, 0x46, 0x4d, 0x13, 0x18, 0x5, 0xe, 0x3f, 0x34, 0x29, 0x22, 0x4b, 0x40, 0x5d, 0x56, 0x67, 0x6c, 0x71, 0x7a, 0xa3, 0xa8, 0xb5, 0xbe, 0x8f, 0x84, 0x99, 0x92, 0xfb, 0xf0, 0xed, 0xe6, 0xd7, 0xdc, 0xc1, 0xca, 0x6e, 0x65, 0x78, 0x73, 0x42, 0x49, 0x54, 0x5f, 0x36, 0x3d, 0x20, 0x2b, 0x1a, 0x11, 0xc, 0x7, 0xde, 0xd5, 0xc8, 0xc3, 0xf2, 0xf9, 0xe4, 0xef, 0x86, 0x8d, 0x90, 0x9b, 0xaa, 0xa1, 0xbc, 0xb7},
+ {0x0, 0xc, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44, 0xc0, 0xcc, 0xd8, 0xd4, 0xf0, 0xfc, 0xe8, 0xe4, 0xa0, 0xac, 0xb8, 0xb4, 0x90, 0x9c, 0x88, 0x84, 0x9d, 0x91, 0x85, 0x89, 0xad, 0xa1, 0xb5, 0xb9, 0xfd, 0xf1, 0xe5, 0xe9, 0xcd, 0xc1, 0xd5, 0xd9, 0x5d, 0x51, 0x45, 0x49, 0x6d, 0x61, 0x75, 0x79, 0x3d, 0x31, 0x25, 0x29, 0xd, 0x1, 0x15, 0x19, 0x27, 0x2b, 0x3f, 0x33, 0x17, 0x1b, 0xf, 0x3, 0x47, 0x4b, 0x5f, 0x53, 0x77, 0x7b, 0x6f, 0x63, 0xe7, 0xeb, 0xff, 0xf3, 0xd7, 0xdb, 0xcf, 0xc3, 0x87, 0x8b, 0x9f, 0x93, 0xb7, 0xbb, 0xaf, 0xa3, 0xba, 0xb6, 0xa2, 0xae, 0x8a, 0x86, 0x92, 0x9e, 0xda, 0xd6, 0xc2, 0xce, 0xea, 0xe6, 0xf2, 0xfe, 0x7a, 0x76, 0x62, 0x6e, 0x4a, 0x46, 0x52, 0x5e, 0x1a, 0x16, 0x2, 0xe, 0x2a, 0x26, 0x32, 0x3e, 0x4e, 0x42, 0x56, 0x5a, 0x7e, 0x72, 0x66, 0x6a, 0x2e, 0x22, 0x36, 0x3a, 0x1e, 0x12, 0x6, 0xa, 0x8e, 0x82, 0x96, 0x9a, 0xbe, 0xb2, 0xa6, 0xaa, 0xee, 0xe2, 0xf6, 0xfa, 0xde, 0xd2, 0xc6, 0xca, 0xd3, 0xdf, 0xcb, 0xc7, 0xe3, 0xef, 0xfb, 0xf7, 0xb3, 0xbf, 0xab, 0xa7, 0x83, 0x8f, 0x9b, 0x97, 0x13, 0x1f, 0xb, 0x7, 0x23, 0x2f, 0x3b, 0x37, 0x73, 0x7f, 0x6b, 0x67, 0x43, 0x4f, 0x5b, 0x57, 0x69, 0x65, 0x71, 0x7d, 0x59, 0x55, 0x41, 0x4d, 0x9, 0x5, 0x11, 0x1d, 0x39, 0x35, 0x21, 0x2d, 0xa9, 0xa5, 0xb1, 0xbd, 0x99, 0x95, 0x81, 0x8d, 0xc9, 0xc5, 0xd1, 0xdd, 0xf9, 0xf5, 0xe1, 0xed, 0xf4, 0xf8, 0xec, 0xe0, 0xc4, 0xc8, 0xdc, 0xd0, 0x94, 0x98, 0x8c, 0x80, 0xa4, 0xa8, 0xbc, 0xb0, 0x34, 0x38, 0x2c, 0x20, 0x4, 0x8, 0x1c, 0x10, 0x54, 0x58, 0x4c, 0x40, 0x64, 0x68, 0x7c, 0x70},
+ {0x0, 0xd, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b, 0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b, 0xbd, 0xb0, 0xa7, 0xaa, 0x89, 0x84, 0x93, 0x9e, 0xd5, 0xd8, 0xcf, 0xc2, 0xe1, 0xec, 0xfb, 0xf6, 0x6d, 0x60, 0x77, 0x7a, 0x59, 0x54, 0x43, 0x4e, 0x5, 0x8, 0x1f, 0x12, 0x31, 0x3c, 0x2b, 0x26, 0x67, 0x6a, 0x7d, 0x70, 0x53, 0x5e, 0x49, 0x44, 0xf, 0x2, 0x15, 0x18, 0x3b, 0x36, 0x21, 0x2c, 0xb7, 0xba, 0xad, 0xa0, 0x83, 0x8e, 0x99, 0x94, 0xdf, 0xd2, 0xc5, 0xc8, 0xeb, 0xe6, 0xf1, 0xfc, 0xda, 0xd7, 0xc0, 0xcd, 0xee, 0xe3, 0xf4, 0xf9, 0xb2, 0xbf, 0xa8, 0xa5, 0x86, 0x8b, 0x9c, 0x91, 0xa, 0x7, 0x10, 0x1d, 0x3e, 0x33, 0x24, 0x29, 0x62, 0x6f, 0x78, 0x75, 0x56, 0x5b, 0x4c, 0x41, 0xce, 0xc3, 0xd4, 0xd9, 0xfa, 0xf7, 0xe0, 0xed, 0xa6, 0xab, 0xbc, 0xb1, 0x92, 0x9f, 0x88, 0x85, 0x1e, 0x13, 0x4, 0x9, 0x2a, 0x27, 0x30, 0x3d, 0x76, 0x7b, 0x6c, 0x61, 0x42, 0x4f, 0x58, 0x55, 0x73, 0x7e, 0x69, 0x64, 0x47, 0x4a, 0x5d, 0x50, 0x1b, 0x16, 0x1, 0xc, 0x2f, 0x22, 0x35, 0x38, 0xa3, 0xae, 0xb9, 0xb4, 0x97, 0x9a, 0x8d, 0x80, 0xcb, 0xc6, 0xd1, 0xdc, 0xff, 0xf2, 0xe5, 0xe8, 0xa9, 0xa4, 0xb3, 0xbe, 0x9d, 0x90, 0x87, 0x8a, 0xc1, 0xcc, 0xdb, 0xd6, 0xf5, 0xf8, 0xef, 0xe2, 0x79, 0x74, 0x63, 0x6e, 0x4d, 0x40, 0x57, 0x5a, 0x11, 0x1c, 0xb, 0x6, 0x25, 0x28, 0x3f, 0x32, 0x14, 0x19, 0xe, 0x3, 0x20, 0x2d, 0x3a, 0x37, 0x7c, 0x71, 0x66, 0x6b, 0x48, 0x45, 0x52, 0x5f, 0xc4, 0xc9, 0xde, 0xd3, 0xf0, 0xfd, 0xea, 0xe7, 0xac, 0xa1, 0xb6, 0xbb, 0x98, 0x95, 0x82, 0x8f},
+ {0x0, 0xe, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a, 0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba, 0xdd, 0xd3, 0xc1, 0xcf, 0xe5, 0xeb, 0xf9, 0xf7, 0xad, 0xa3, 0xb1, 0xbf, 0x95, 0x9b, 0x89, 0x87, 0x3d, 0x33, 0x21, 0x2f, 0x5, 0xb, 0x19, 0x17, 0x4d, 0x43, 0x51, 0x5f, 0x75, 0x7b, 0x69, 0x67, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d, 0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0x47, 0x49, 0x5b, 0x55, 0x7f, 0x71, 0x63, 0x6d, 0x37, 0x39, 0x2b, 0x25, 0xf, 0x1, 0x13, 0x1d, 0x7a, 0x74, 0x66, 0x68, 0x42, 0x4c, 0x5e, 0x50, 0xa, 0x4, 0x16, 0x18, 0x32, 0x3c, 0x2e, 0x20, 0x9a, 0x94, 0x86, 0x88, 0xa2, 0xac, 0xbe, 0xb0, 0xea, 0xe4, 0xf6, 0xf8, 0xd2, 0xdc, 0xce, 0xc0, 0x53, 0x5d, 0x4f, 0x41, 0x6b, 0x65, 0x77, 0x79, 0x23, 0x2d, 0x3f, 0x31, 0x1b, 0x15, 0x7, 0x9, 0xb3, 0xbd, 0xaf, 0xa1, 0x8b, 0x85, 0x97, 0x99, 0xc3, 0xcd, 0xdf, 0xd1, 0xfb, 0xf5, 0xe7, 0xe9, 0x8e, 0x80, 0x92, 0x9c, 0xb6, 0xb8, 0xaa, 0xa4, 0xfe, 0xf0, 0xe2, 0xec, 0xc6, 0xc8, 0xda, 0xd4, 0x6e, 0x60, 0x72, 0x7c, 0x56, 0x58, 0x4a, 0x44, 0x1e, 0x10, 0x2, 0xc, 0x26, 0x28, 0x3a, 0x34, 0xf4, 0xfa, 0xe8, 0xe6, 0xcc, 0xc2, 0xd0, 0xde, 0x84, 0x8a, 0x98, 0x96, 0xbc, 0xb2, 0xa0, 0xae, 0x14, 0x1a, 0x8, 0x6, 0x2c, 0x22, 0x30, 0x3e, 0x64, 0x6a, 0x78, 0x76, 0x5c, 0x52, 0x40, 0x4e, 0x29, 0x27, 0x35, 0x3b, 0x11, 0x1f, 0xd, 0x3, 0x59, 0x57, 0x45, 0x4b, 0x61, 0x6f, 0x7d, 0x73, 0xc9, 0xc7, 0xd5, 0xdb, 0xf1, 0xff, 0xed, 0xe3, 0xb9, 0xb7, 0xa5, 0xab, 0x81, 0x8f, 0x9d, 0x93},
+ {0x0, 0xf, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55, 0xf0, 0xff, 0xee, 0xe1, 0xcc, 0xc3, 0xd2, 0xdd, 0x88, 0x87, 0x96, 0x99, 0xb4, 0xbb, 0xaa, 0xa5, 0xfd, 0xf2, 0xe3, 0xec, 0xc1, 0xce, 0xdf, 0xd0, 0x85, 0x8a, 0x9b, 0x94, 0xb9, 0xb6, 0xa7, 0xa8, 0xd, 0x2, 0x13, 0x1c, 0x31, 0x3e, 0x2f, 0x20, 0x75, 0x7a, 0x6b, 0x64, 0x49, 0x46, 0x57, 0x58, 0xe7, 0xe8, 0xf9, 0xf6, 0xdb, 0xd4, 0xc5, 0xca, 0x9f, 0x90, 0x81, 0x8e, 0xa3, 0xac, 0xbd, 0xb2, 0x17, 0x18, 0x9, 0x6, 0x2b, 0x24, 0x35, 0x3a, 0x6f, 0x60, 0x71, 0x7e, 0x53, 0x5c, 0x4d, 0x42, 0x1a, 0x15, 0x4, 0xb, 0x26, 0x29, 0x38, 0x37, 0x62, 0x6d, 0x7c, 0x73, 0x5e, 0x51, 0x40, 0x4f, 0xea, 0xe5, 0xf4, 0xfb, 0xd6, 0xd9, 0xc8, 0xc7, 0x92, 0x9d, 0x8c, 0x83, 0xae, 0xa1, 0xb0, 0xbf, 0xd3, 0xdc, 0xcd, 0xc2, 0xef, 0xe0, 0xf1, 0xfe, 0xab, 0xa4, 0xb5, 0xba, 0x97, 0x98, 0x89, 0x86, 0x23, 0x2c, 0x3d, 0x32, 0x1f, 0x10, 0x1, 0xe, 0x5b, 0x54, 0x45, 0x4a, 0x67, 0x68, 0x79, 0x76, 0x2e, 0x21, 0x30, 0x3f, 0x12, 0x1d, 0xc, 0x3, 0x56, 0x59, 0x48, 0x47, 0x6a, 0x65, 0x74, 0x7b, 0xde, 0xd1, 0xc0, 0xcf, 0xe2, 0xed, 0xfc, 0xf3, 0xa6, 0xa9, 0xb8, 0xb7, 0x9a, 0x95, 0x84, 0x8b, 0x34, 0x3b, 0x2a, 0x25, 0x8, 0x7, 0x16, 0x19, 0x4c, 0x43, 0x52, 0x5d, 0x70, 0x7f, 0x6e, 0x61, 0xc4, 0xcb, 0xda, 0xd5, 0xf8, 0xf7, 0xe6, 0xe9, 0xbc, 0xb3, 0xa2, 0xad, 0x80, 0x8f, 0x9e, 0x91, 0xc9, 0xc6, 0xd7, 0xd8, 0xf5, 0xfa, 0xeb, 0xe4, 0xb1, 0xbe, 0xaf, 0xa0, 0x8d, 0x82, 0x93, 0x9c, 0x39, 0x36, 0x27, 0x28, 0x5, 0xa, 0x1b, 0x14, 0x41, 0x4e, 0x5f, 0x50, 0x7d, 0x72, 0x63, 0x6c},
+ {0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0, 0x1d, 0xd, 0x3d, 0x2d, 0x5d, 0x4d, 0x7d, 0x6d, 0x9d, 0x8d, 0xbd, 0xad, 0xdd, 0xcd, 0xfd, 0xed, 0x3a, 0x2a, 0x1a, 0xa, 0x7a, 0x6a, 0x5a, 0x4a, 0xba, 0xaa, 0x9a, 0x8a, 0xfa, 0xea, 0xda, 0xca, 0x27, 0x37, 0x7, 0x17, 0x67, 0x77, 0x47, 0x57, 0xa7, 0xb7, 0x87, 0x97, 0xe7, 0xf7, 0xc7, 0xd7, 0x74, 0x64, 0x54, 0x44, 0x34, 0x24, 0x14, 0x4, 0xf4, 0xe4, 0xd4, 0xc4, 0xb4, 0xa4, 0x94, 0x84, 0x69, 0x79, 0x49, 0x59, 0x29, 0x39, 0x9, 0x19, 0xe9, 0xf9, 0xc9, 0xd9, 0xa9, 0xb9, 0x89, 0x99, 0x4e, 0x5e, 0x6e, 0x7e, 0xe, 0x1e, 0x2e, 0x3e, 0xce, 0xde, 0xee, 0xfe, 0x8e, 0x9e, 0xae, 0xbe, 0x53, 0x43, 0x73, 0x63, 0x13, 0x3, 0x33, 0x23, 0xd3, 0xc3, 0xf3, 0xe3, 0x93, 0x83, 0xb3, 0xa3, 0xe8, 0xf8, 0xc8, 0xd8, 0xa8, 0xb8, 0x88, 0x98, 0x68, 0x78, 0x48, 0x58, 0x28, 0x38, 0x8, 0x18, 0xf5, 0xe5, 0xd5, 0xc5, 0xb5, 0xa5, 0x95, 0x85, 0x75, 0x65, 0x55, 0x45, 0x35, 0x25, 0x15, 0x5, 0xd2, 0xc2, 0xf2, 0xe2, 0x92, 0x82, 0xb2, 0xa2, 0x52, 0x42, 0x72, 0x62, 0x12, 0x2, 0x32, 0x22, 0xcf, 0xdf, 0xef, 0xff, 0x8f, 0x9f, 0xaf, 0xbf, 0x4f, 0x5f, 0x6f, 0x7f, 0xf, 0x1f, 0x2f, 0x3f, 0x9c, 0x8c, 0xbc, 0xac, 0xdc, 0xcc, 0xfc, 0xec, 0x1c, 0xc, 0x3c, 0x2c, 0x5c, 0x4c, 0x7c, 0x6c, 0x81, 0x91, 0xa1, 0xb1, 0xc1, 0xd1, 0xe1, 0xf1, 0x1, 0x11, 0x21, 0x31, 0x41, 0x51, 0x61, 0x71, 0xa6, 0xb6, 0x86, 0x96, 0xe6, 0xf6, 0xc6, 0xd6, 0x26, 0x36, 0x6, 0x16, 0x66, 0x76, 0x46, 0x56, 0xbb, 0xab, 0x9b, 0x8b, 0xfb, 0xeb, 0xdb, 0xcb, 0x3b, 0x2b, 0x1b, 0xb, 0x7b, 0x6b, 0x5b, 0x4b},
+ {0x0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0xd, 0x1c, 0x2f, 0x3e, 0x49, 0x58, 0x6b, 0x7a, 0x85, 0x94, 0xa7, 0xb6, 0xc1, 0xd0, 0xe3, 0xf2, 0x1a, 0xb, 0x38, 0x29, 0x5e, 0x4f, 0x7c, 0x6d, 0x92, 0x83, 0xb0, 0xa1, 0xd6, 0xc7, 0xf4, 0xe5, 0x17, 0x6, 0x35, 0x24, 0x53, 0x42, 0x71, 0x60, 0x9f, 0x8e, 0xbd, 0xac, 0xdb, 0xca, 0xf9, 0xe8, 0x34, 0x25, 0x16, 0x7, 0x70, 0x61, 0x52, 0x43, 0xbc, 0xad, 0x9e, 0x8f, 0xf8, 0xe9, 0xda, 0xcb, 0x39, 0x28, 0x1b, 0xa, 0x7d, 0x6c, 0x5f, 0x4e, 0xb1, 0xa0, 0x93, 0x82, 0xf5, 0xe4, 0xd7, 0xc6, 0x2e, 0x3f, 0xc, 0x1d, 0x6a, 0x7b, 0x48, 0x59, 0xa6, 0xb7, 0x84, 0x95, 0xe2, 0xf3, 0xc0, 0xd1, 0x23, 0x32, 0x1, 0x10, 0x67, 0x76, 0x45, 0x54, 0xab, 0xba, 0x89, 0x98, 0xef, 0xfe, 0xcd, 0xdc, 0x68, 0x79, 0x4a, 0x5b, 0x2c, 0x3d, 0xe, 0x1f, 0xe0, 0xf1, 0xc2, 0xd3, 0xa4, 0xb5, 0x86, 0x97, 0x65, 0x74, 0x47, 0x56, 0x21, 0x30, 0x3, 0x12, 0xed, 0xfc, 0xcf, 0xde, 0xa9, 0xb8, 0x8b, 0x9a, 0x72, 0x63, 0x50, 0x41, 0x36, 0x27, 0x14, 0x5, 0xfa, 0xeb, 0xd8, 0xc9, 0xbe, 0xaf, 0x9c, 0x8d, 0x7f, 0x6e, 0x5d, 0x4c, 0x3b, 0x2a, 0x19, 0x8, 0xf7, 0xe6, 0xd5, 0xc4, 0xb3, 0xa2, 0x91, 0x80, 0x5c, 0x4d, 0x7e, 0x6f, 0x18, 0x9, 0x3a, 0x2b, 0xd4, 0xc5, 0xf6, 0xe7, 0x90, 0x81, 0xb2, 0xa3, 0x51, 0x40, 0x73, 0x62, 0x15, 0x4, 0x37, 0x26, 0xd9, 0xc8, 0xfb, 0xea, 0x9d, 0x8c, 0xbf, 0xae, 0x46, 0x57, 0x64, 0x75, 0x2, 0x13, 0x20, 0x31, 0xce, 0xdf, 0xec, 0xfd, 0x8a, 0x9b, 0xa8, 0xb9, 0x4b, 0x5a, 0x69, 0x78, 0xf, 0x1e, 0x2d, 0x3c, 0xc3, 0xd2, 0xe1, 0xf0, 0x87, 0x96, 0xa5, 0xb4},
+ {0x0, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee, 0x3d, 0x2f, 0x19, 0xb, 0x75, 0x67, 0x51, 0x43, 0xad, 0xbf, 0x89, 0x9b, 0xe5, 0xf7, 0xc1, 0xd3, 0x7a, 0x68, 0x5e, 0x4c, 0x32, 0x20, 0x16, 0x4, 0xea, 0xf8, 0xce, 0xdc, 0xa2, 0xb0, 0x86, 0x94, 0x47, 0x55, 0x63, 0x71, 0xf, 0x1d, 0x2b, 0x39, 0xd7, 0xc5, 0xf3, 0xe1, 0x9f, 0x8d, 0xbb, 0xa9, 0xf4, 0xe6, 0xd0, 0xc2, 0xbc, 0xae, 0x98, 0x8a, 0x64, 0x76, 0x40, 0x52, 0x2c, 0x3e, 0x8, 0x1a, 0xc9, 0xdb, 0xed, 0xff, 0x81, 0x93, 0xa5, 0xb7, 0x59, 0x4b, 0x7d, 0x6f, 0x11, 0x3, 0x35, 0x27, 0x8e, 0x9c, 0xaa, 0xb8, 0xc6, 0xd4, 0xe2, 0xf0, 0x1e, 0xc, 0x3a, 0x28, 0x56, 0x44, 0x72, 0x60, 0xb3, 0xa1, 0x97, 0x85, 0xfb, 0xe9, 0xdf, 0xcd, 0x23, 0x31, 0x7, 0x15, 0x6b, 0x79, 0x4f, 0x5d, 0xf5, 0xe7, 0xd1, 0xc3, 0xbd, 0xaf, 0x99, 0x8b, 0x65, 0x77, 0x41, 0x53, 0x2d, 0x3f, 0x9, 0x1b, 0xc8, 0xda, 0xec, 0xfe, 0x80, 0x92, 0xa4, 0xb6, 0x58, 0x4a, 0x7c, 0x6e, 0x10, 0x2, 0x34, 0x26, 0x8f, 0x9d, 0xab, 0xb9, 0xc7, 0xd5, 0xe3, 0xf1, 0x1f, 0xd, 0x3b, 0x29, 0x57, 0x45, 0x73, 0x61, 0xb2, 0xa0, 0x96, 0x84, 0xfa, 0xe8, 0xde, 0xcc, 0x22, 0x30, 0x6, 0x14, 0x6a, 0x78, 0x4e, 0x5c, 0x1, 0x13, 0x25, 0x37, 0x49, 0x5b, 0x6d, 0x7f, 0x91, 0x83, 0xb5, 0xa7, 0xd9, 0xcb, 0xfd, 0xef, 0x3c, 0x2e, 0x18, 0xa, 0x74, 0x66, 0x50, 0x42, 0xac, 0xbe, 0x88, 0x9a, 0xe4, 0xf6, 0xc0, 0xd2, 0x7b, 0x69, 0x5f, 0x4d, 0x33, 0x21, 0x17, 0x5, 0xeb, 0xf9, 0xcf, 0xdd, 0xa3, 0xb1, 0x87, 0x95, 0x46, 0x54, 0x62, 0x70, 0xe, 0x1c, 0x2a, 0x38, 0xd6, 0xc4, 0xf2, 0xe0, 0x9e, 0x8c, 0xba, 0xa8},
+ {0x0, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1, 0x2d, 0x3e, 0xb, 0x18, 0x61, 0x72, 0x47, 0x54, 0xb5, 0xa6, 0x93, 0x80, 0xf9, 0xea, 0xdf, 0xcc, 0x5a, 0x49, 0x7c, 0x6f, 0x16, 0x5, 0x30, 0x23, 0xc2, 0xd1, 0xe4, 0xf7, 0x8e, 0x9d, 0xa8, 0xbb, 0x77, 0x64, 0x51, 0x42, 0x3b, 0x28, 0x1d, 0xe, 0xef, 0xfc, 0xc9, 0xda, 0xa3, 0xb0, 0x85, 0x96, 0xb4, 0xa7, 0x92, 0x81, 0xf8, 0xeb, 0xde, 0xcd, 0x2c, 0x3f, 0xa, 0x19, 0x60, 0x73, 0x46, 0x55, 0x99, 0x8a, 0xbf, 0xac, 0xd5, 0xc6, 0xf3, 0xe0, 0x1, 0x12, 0x27, 0x34, 0x4d, 0x5e, 0x6b, 0x78, 0xee, 0xfd, 0xc8, 0xdb, 0xa2, 0xb1, 0x84, 0x97, 0x76, 0x65, 0x50, 0x43, 0x3a, 0x29, 0x1c, 0xf, 0xc3, 0xd0, 0xe5, 0xf6, 0x8f, 0x9c, 0xa9, 0xba, 0x5b, 0x48, 0x7d, 0x6e, 0x17, 0x4, 0x31, 0x22, 0x75, 0x66, 0x53, 0x40, 0x39, 0x2a, 0x1f, 0xc, 0xed, 0xfe, 0xcb, 0xd8, 0xa1, 0xb2, 0x87, 0x94, 0x58, 0x4b, 0x7e, 0x6d, 0x14, 0x7, 0x32, 0x21, 0xc0, 0xd3, 0xe6, 0xf5, 0x8c, 0x9f, 0xaa, 0xb9, 0x2f, 0x3c, 0x9, 0x1a, 0x63, 0x70, 0x45, 0x56, 0xb7, 0xa4, 0x91, 0x82, 0xfb, 0xe8, 0xdd, 0xce, 0x2, 0x11, 0x24, 0x37, 0x4e, 0x5d, 0x68, 0x7b, 0x9a, 0x89, 0xbc, 0xaf, 0xd6, 0xc5, 0xf0, 0xe3, 0xc1, 0xd2, 0xe7, 0xf4, 0x8d, 0x9e, 0xab, 0xb8, 0x59, 0x4a, 0x7f, 0x6c, 0x15, 0x6, 0x33, 0x20, 0xec, 0xff, 0xca, 0xd9, 0xa0, 0xb3, 0x86, 0x95, 0x74, 0x67, 0x52, 0x41, 0x38, 0x2b, 0x1e, 0xd, 0x9b, 0x88, 0xbd, 0xae, 0xd7, 0xc4, 0xf1, 0xe2, 0x3, 0x10, 0x25, 0x36, 0x4f, 0x5c, 0x69, 0x7a, 0xb6, 0xa5, 0x90, 0x83, 0xfa, 0xe9, 0xdc, 0xcf, 0x2e, 0x3d, 0x8, 0x1b, 0x62, 0x71, 0x44, 0x57},
+ {0x0, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc, 0x5d, 0x49, 0x75, 0x61, 0xd, 0x19, 0x25, 0x31, 0xfd, 0xe9, 0xd5, 0xc1, 0xad, 0xb9, 0x85, 0x91, 0xba, 0xae, 0x92, 0x86, 0xea, 0xfe, 0xc2, 0xd6, 0x1a, 0xe, 0x32, 0x26, 0x4a, 0x5e, 0x62, 0x76, 0xe7, 0xf3, 0xcf, 0xdb, 0xb7, 0xa3, 0x9f, 0x8b, 0x47, 0x53, 0x6f, 0x7b, 0x17, 0x3, 0x3f, 0x2b, 0x69, 0x7d, 0x41, 0x55, 0x39, 0x2d, 0x11, 0x5, 0xc9, 0xdd, 0xe1, 0xf5, 0x99, 0x8d, 0xb1, 0xa5, 0x34, 0x20, 0x1c, 0x8, 0x64, 0x70, 0x4c, 0x58, 0x94, 0x80, 0xbc, 0xa8, 0xc4, 0xd0, 0xec, 0xf8, 0xd3, 0xc7, 0xfb, 0xef, 0x83, 0x97, 0xab, 0xbf, 0x73, 0x67, 0x5b, 0x4f, 0x23, 0x37, 0xb, 0x1f, 0x8e, 0x9a, 0xa6, 0xb2, 0xde, 0xca, 0xf6, 0xe2, 0x2e, 0x3a, 0x6, 0x12, 0x7e, 0x6a, 0x56, 0x42, 0xd2, 0xc6, 0xfa, 0xee, 0x82, 0x96, 0xaa, 0xbe, 0x72, 0x66, 0x5a, 0x4e, 0x22, 0x36, 0xa, 0x1e, 0x8f, 0x9b, 0xa7, 0xb3, 0xdf, 0xcb, 0xf7, 0xe3, 0x2f, 0x3b, 0x7, 0x13, 0x7f, 0x6b, 0x57, 0x43, 0x68, 0x7c, 0x40, 0x54, 0x38, 0x2c, 0x10, 0x4, 0xc8, 0xdc, 0xe0, 0xf4, 0x98, 0x8c, 0xb0, 0xa4, 0x35, 0x21, 0x1d, 0x9, 0x65, 0x71, 0x4d, 0x59, 0x95, 0x81, 0xbd, 0xa9, 0xc5, 0xd1, 0xed, 0xf9, 0xbb, 0xaf, 0x93, 0x87, 0xeb, 0xff, 0xc3, 0xd7, 0x1b, 0xf, 0x33, 0x27, 0x4b, 0x5f, 0x63, 0x77, 0xe6, 0xf2, 0xce, 0xda, 0xb6, 0xa2, 0x9e, 0x8a, 0x46, 0x52, 0x6e, 0x7a, 0x16, 0x2, 0x3e, 0x2a, 0x1, 0x15, 0x29, 0x3d, 0x51, 0x45, 0x79, 0x6d, 0xa1, 0xb5, 0x89, 0x9d, 0xf1, 0xe5, 0xd9, 0xcd, 0x5c, 0x48, 0x74, 0x60, 0xc, 0x18, 0x24, 0x30, 0xfc, 0xe8, 0xd4, 0xc0, 0xac, 0xb8, 0x84, 0x90},
+ {0x0, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3, 0x4d, 0x58, 0x67, 0x72, 0x19, 0xc, 0x33, 0x26, 0xe5, 0xf0, 0xcf, 0xda, 0xb1, 0xa4, 0x9b, 0x8e, 0x9a, 0x8f, 0xb0, 0xa5, 0xce, 0xdb, 0xe4, 0xf1, 0x32, 0x27, 0x18, 0xd, 0x66, 0x73, 0x4c, 0x59, 0xd7, 0xc2, 0xfd, 0xe8, 0x83, 0x96, 0xa9, 0xbc, 0x7f, 0x6a, 0x55, 0x40, 0x2b, 0x3e, 0x1, 0x14, 0x29, 0x3c, 0x3, 0x16, 0x7d, 0x68, 0x57, 0x42, 0x81, 0x94, 0xab, 0xbe, 0xd5, 0xc0, 0xff, 0xea, 0x64, 0x71, 0x4e, 0x5b, 0x30, 0x25, 0x1a, 0xf, 0xcc, 0xd9, 0xe6, 0xf3, 0x98, 0x8d, 0xb2, 0xa7, 0xb3, 0xa6, 0x99, 0x8c, 0xe7, 0xf2, 0xcd, 0xd8, 0x1b, 0xe, 0x31, 0x24, 0x4f, 0x5a, 0x65, 0x70, 0xfe, 0xeb, 0xd4, 0xc1, 0xaa, 0xbf, 0x80, 0x95, 0x56, 0x43, 0x7c, 0x69, 0x2, 0x17, 0x28, 0x3d, 0x52, 0x47, 0x78, 0x6d, 0x6, 0x13, 0x2c, 0x39, 0xfa, 0xef, 0xd0, 0xc5, 0xae, 0xbb, 0x84, 0x91, 0x1f, 0xa, 0x35, 0x20, 0x4b, 0x5e, 0x61, 0x74, 0xb7, 0xa2, 0x9d, 0x88, 0xe3, 0xf6, 0xc9, 0xdc, 0xc8, 0xdd, 0xe2, 0xf7, 0x9c, 0x89, 0xb6, 0xa3, 0x60, 0x75, 0x4a, 0x5f, 0x34, 0x21, 0x1e, 0xb, 0x85, 0x90, 0xaf, 0xba, 0xd1, 0xc4, 0xfb, 0xee, 0x2d, 0x38, 0x7, 0x12, 0x79, 0x6c, 0x53, 0x46, 0x7b, 0x6e, 0x51, 0x44, 0x2f, 0x3a, 0x5, 0x10, 0xd3, 0xc6, 0xf9, 0xec, 0x87, 0x92, 0xad, 0xb8, 0x36, 0x23, 0x1c, 0x9, 0x62, 0x77, 0x48, 0x5d, 0x9e, 0x8b, 0xb4, 0xa1, 0xca, 0xdf, 0xe0, 0xf5, 0xe1, 0xf4, 0xcb, 0xde, 0xb5, 0xa0, 0x9f, 0x8a, 0x49, 0x5c, 0x63, 0x76, 0x1d, 0x8, 0x37, 0x22, 0xac, 0xb9, 0x86, 0x93, 0xf8, 0xed, 0xd2, 0xc7, 0x4, 0x11, 0x2e, 0x3b, 0x50, 0x45, 0x7a, 0x6f},
+ {0x0, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2, 0x7d, 0x6b, 0x51, 0x47, 0x25, 0x33, 0x9, 0x1f, 0xcd, 0xdb, 0xe1, 0xf7, 0x95, 0x83, 0xb9, 0xaf, 0xfa, 0xec, 0xd6, 0xc0, 0xa2, 0xb4, 0x8e, 0x98, 0x4a, 0x5c, 0x66, 0x70, 0x12, 0x4, 0x3e, 0x28, 0x87, 0x91, 0xab, 0xbd, 0xdf, 0xc9, 0xf3, 0xe5, 0x37, 0x21, 0x1b, 0xd, 0x6f, 0x79, 0x43, 0x55, 0xe9, 0xff, 0xc5, 0xd3, 0xb1, 0xa7, 0x9d, 0x8b, 0x59, 0x4f, 0x75, 0x63, 0x1, 0x17, 0x2d, 0x3b, 0x94, 0x82, 0xb8, 0xae, 0xcc, 0xda, 0xe0, 0xf6, 0x24, 0x32, 0x8, 0x1e, 0x7c, 0x6a, 0x50, 0x46, 0x13, 0x5, 0x3f, 0x29, 0x4b, 0x5d, 0x67, 0x71, 0xa3, 0xb5, 0x8f, 0x99, 0xfb, 0xed, 0xd7, 0xc1, 0x6e, 0x78, 0x42, 0x54, 0x36, 0x20, 0x1a, 0xc, 0xde, 0xc8, 0xf2, 0xe4, 0x86, 0x90, 0xaa, 0xbc, 0xcf, 0xd9, 0xe3, 0xf5, 0x97, 0x81, 0xbb, 0xad, 0x7f, 0x69, 0x53, 0x45, 0x27, 0x31, 0xb, 0x1d, 0xb2, 0xa4, 0x9e, 0x88, 0xea, 0xfc, 0xc6, 0xd0, 0x2, 0x14, 0x2e, 0x38, 0x5a, 0x4c, 0x76, 0x60, 0x35, 0x23, 0x19, 0xf, 0x6d, 0x7b, 0x41, 0x57, 0x85, 0x93, 0xa9, 0xbf, 0xdd, 0xcb, 0xf1, 0xe7, 0x48, 0x5e, 0x64, 0x72, 0x10, 0x6, 0x3c, 0x2a, 0xf8, 0xee, 0xd4, 0xc2, 0xa0, 0xb6, 0x8c, 0x9a, 0x26, 0x30, 0xa, 0x1c, 0x7e, 0x68, 0x52, 0x44, 0x96, 0x80, 0xba, 0xac, 0xce, 0xd8, 0xe2, 0xf4, 0x5b, 0x4d, 0x77, 0x61, 0x3, 0x15, 0x2f, 0x39, 0xeb, 0xfd, 0xc7, 0xd1, 0xb3, 0xa5, 0x9f, 0x89, 0xdc, 0xca, 0xf0, 0xe6, 0x84, 0x92, 0xa8, 0xbe, 0x6c, 0x7a, 0x40, 0x56, 0x34, 0x22, 0x18, 0xe, 0xa1, 0xb7, 0x8d, 0x9b, 0xf9, 0xef, 0xd5, 0xc3, 0x11, 0x7, 0x3d, 0x2b, 0x49, 0x5f, 0x65, 0x73},
+ {0x0, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd, 0x6d, 0x7a, 0x43, 0x54, 0x31, 0x26, 0x1f, 0x8, 0xd5, 0xc2, 0xfb, 0xec, 0x89, 0x9e, 0xa7, 0xb0, 0xda, 0xcd, 0xf4, 0xe3, 0x86, 0x91, 0xa8, 0xbf, 0x62, 0x75, 0x4c, 0x5b, 0x3e, 0x29, 0x10, 0x7, 0xb7, 0xa0, 0x99, 0x8e, 0xeb, 0xfc, 0xc5, 0xd2, 0xf, 0x18, 0x21, 0x36, 0x53, 0x44, 0x7d, 0x6a, 0xa9, 0xbe, 0x87, 0x90, 0xf5, 0xe2, 0xdb, 0xcc, 0x11, 0x6, 0x3f, 0x28, 0x4d, 0x5a, 0x63, 0x74, 0xc4, 0xd3, 0xea, 0xfd, 0x98, 0x8f, 0xb6, 0xa1, 0x7c, 0x6b, 0x52, 0x45, 0x20, 0x37, 0xe, 0x19, 0x73, 0x64, 0x5d, 0x4a, 0x2f, 0x38, 0x1, 0x16, 0xcb, 0xdc, 0xe5, 0xf2, 0x97, 0x80, 0xb9, 0xae, 0x1e, 0x9, 0x30, 0x27, 0x42, 0x55, 0x6c, 0x7b, 0xa6, 0xb1, 0x88, 0x9f, 0xfa, 0xed, 0xd4, 0xc3, 0x4f, 0x58, 0x61, 0x76, 0x13, 0x4, 0x3d, 0x2a, 0xf7, 0xe0, 0xd9, 0xce, 0xab, 0xbc, 0x85, 0x92, 0x22, 0x35, 0xc, 0x1b, 0x7e, 0x69, 0x50, 0x47, 0x9a, 0x8d, 0xb4, 0xa3, 0xc6, 0xd1, 0xe8, 0xff, 0x95, 0x82, 0xbb, 0xac, 0xc9, 0xde, 0xe7, 0xf0, 0x2d, 0x3a, 0x3, 0x14, 0x71, 0x66, 0x5f, 0x48, 0xf8, 0xef, 0xd6, 0xc1, 0xa4, 0xb3, 0x8a, 0x9d, 0x40, 0x57, 0x6e, 0x79, 0x1c, 0xb, 0x32, 0x25, 0xe6, 0xf1, 0xc8, 0xdf, 0xba, 0xad, 0x94, 0x83, 0x5e, 0x49, 0x70, 0x67, 0x2, 0x15, 0x2c, 0x3b, 0x8b, 0x9c, 0xa5, 0xb2, 0xd7, 0xc0, 0xf9, 0xee, 0x33, 0x24, 0x1d, 0xa, 0x6f, 0x78, 0x41, 0x56, 0x3c, 0x2b, 0x12, 0x5, 0x60, 0x77, 0x4e, 0x59, 0x84, 0x93, 0xaa, 0xbd, 0xd8, 0xcf, 0xf6, 0xe1, 0x51, 0x46, 0x7f, 0x68, 0xd, 0x1a, 0x23, 0x34, 0xe9, 0xfe, 0xc7, 0xd0, 0xb5, 0xa2, 0x9b, 0x8c},
+ {0x0, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88, 0x9d, 0x85, 0xad, 0xb5, 0xfd, 0xe5, 0xcd, 0xd5, 0x5d, 0x45, 0x6d, 0x75, 0x3d, 0x25, 0xd, 0x15, 0x27, 0x3f, 0x17, 0xf, 0x47, 0x5f, 0x77, 0x6f, 0xe7, 0xff, 0xd7, 0xcf, 0x87, 0x9f, 0xb7, 0xaf, 0xba, 0xa2, 0x8a, 0x92, 0xda, 0xc2, 0xea, 0xf2, 0x7a, 0x62, 0x4a, 0x52, 0x1a, 0x2, 0x2a, 0x32, 0x4e, 0x56, 0x7e, 0x66, 0x2e, 0x36, 0x1e, 0x6, 0x8e, 0x96, 0xbe, 0xa6, 0xee, 0xf6, 0xde, 0xc6, 0xd3, 0xcb, 0xe3, 0xfb, 0xb3, 0xab, 0x83, 0x9b, 0x13, 0xb, 0x23, 0x3b, 0x73, 0x6b, 0x43, 0x5b, 0x69, 0x71, 0x59, 0x41, 0x9, 0x11, 0x39, 0x21, 0xa9, 0xb1, 0x99, 0x81, 0xc9, 0xd1, 0xf9, 0xe1, 0xf4, 0xec, 0xc4, 0xdc, 0x94, 0x8c, 0xa4, 0xbc, 0x34, 0x2c, 0x4, 0x1c, 0x54, 0x4c, 0x64, 0x7c, 0x9c, 0x84, 0xac, 0xb4, 0xfc, 0xe4, 0xcc, 0xd4, 0x5c, 0x44, 0x6c, 0x74, 0x3c, 0x24, 0xc, 0x14, 0x1, 0x19, 0x31, 0x29, 0x61, 0x79, 0x51, 0x49, 0xc1, 0xd9, 0xf1, 0xe9, 0xa1, 0xb9, 0x91, 0x89, 0xbb, 0xa3, 0x8b, 0x93, 0xdb, 0xc3, 0xeb, 0xf3, 0x7b, 0x63, 0x4b, 0x53, 0x1b, 0x3, 0x2b, 0x33, 0x26, 0x3e, 0x16, 0xe, 0x46, 0x5e, 0x76, 0x6e, 0xe6, 0xfe, 0xd6, 0xce, 0x86, 0x9e, 0xb6, 0xae, 0xd2, 0xca, 0xe2, 0xfa, 0xb2, 0xaa, 0x82, 0x9a, 0x12, 0xa, 0x22, 0x3a, 0x72, 0x6a, 0x42, 0x5a, 0x4f, 0x57, 0x7f, 0x67, 0x2f, 0x37, 0x1f, 0x7, 0x8f, 0x97, 0xbf, 0xa7, 0xef, 0xf7, 0xdf, 0xc7, 0xf5, 0xed, 0xc5, 0xdd, 0x95, 0x8d, 0xa5, 0xbd, 0x35, 0x2d, 0x5, 0x1d, 0x55, 0x4d, 0x65, 0x7d, 0x68, 0x70, 0x58, 0x40, 0x8, 0x10, 0x38, 0x20, 0xa8, 0xb0, 0x98, 0x80, 0xc8, 0xd0, 0xf8, 0xe0},
+ {0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87, 0x8d, 0x94, 0xbf, 0xa6, 0xe9, 0xf0, 0xdb, 0xc2, 0x45, 0x5c, 0x77, 0x6e, 0x21, 0x38, 0x13, 0xa, 0x7, 0x1e, 0x35, 0x2c, 0x63, 0x7a, 0x51, 0x48, 0xcf, 0xd6, 0xfd, 0xe4, 0xab, 0xb2, 0x99, 0x80, 0x8a, 0x93, 0xb8, 0xa1, 0xee, 0xf7, 0xdc, 0xc5, 0x42, 0x5b, 0x70, 0x69, 0x26, 0x3f, 0x14, 0xd, 0xe, 0x17, 0x3c, 0x25, 0x6a, 0x73, 0x58, 0x41, 0xc6, 0xdf, 0xf4, 0xed, 0xa2, 0xbb, 0x90, 0x89, 0x83, 0x9a, 0xb1, 0xa8, 0xe7, 0xfe, 0xd5, 0xcc, 0x4b, 0x52, 0x79, 0x60, 0x2f, 0x36, 0x1d, 0x4, 0x9, 0x10, 0x3b, 0x22, 0x6d, 0x74, 0x5f, 0x46, 0xc1, 0xd8, 0xf3, 0xea, 0xa5, 0xbc, 0x97, 0x8e, 0x84, 0x9d, 0xb6, 0xaf, 0xe0, 0xf9, 0xd2, 0xcb, 0x4c, 0x55, 0x7e, 0x67, 0x28, 0x31, 0x1a, 0x3, 0x1c, 0x5, 0x2e, 0x37, 0x78, 0x61, 0x4a, 0x53, 0xd4, 0xcd, 0xe6, 0xff, 0xb0, 0xa9, 0x82, 0x9b, 0x91, 0x88, 0xa3, 0xba, 0xf5, 0xec, 0xc7, 0xde, 0x59, 0x40, 0x6b, 0x72, 0x3d, 0x24, 0xf, 0x16, 0x1b, 0x2, 0x29, 0x30, 0x7f, 0x66, 0x4d, 0x54, 0xd3, 0xca, 0xe1, 0xf8, 0xb7, 0xae, 0x85, 0x9c, 0x96, 0x8f, 0xa4, 0xbd, 0xf2, 0xeb, 0xc0, 0xd9, 0x5e, 0x47, 0x6c, 0x75, 0x3a, 0x23, 0x8, 0x11, 0x12, 0xb, 0x20, 0x39, 0x76, 0x6f, 0x44, 0x5d, 0xda, 0xc3, 0xe8, 0xf1, 0xbe, 0xa7, 0x8c, 0x95, 0x9f, 0x86, 0xad, 0xb4, 0xfb, 0xe2, 0xc9, 0xd0, 0x57, 0x4e, 0x65, 0x7c, 0x33, 0x2a, 0x1, 0x18, 0x15, 0xc, 0x27, 0x3e, 0x71, 0x68, 0x43, 0x5a, 0xdd, 0xc4, 0xef, 0xf6, 0xb9, 0xa0, 0x8b, 0x92, 0x98, 0x81, 0xaa, 0xb3, 0xfc, 0xe5, 0xce, 0xd7, 0x50, 0x49, 0x62, 0x7b, 0x34, 0x2d, 0x6, 0x1f},
+ {0x0, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96, 0xbd, 0xa7, 0x89, 0x93, 0xd5, 0xcf, 0xe1, 0xfb, 0x6d, 0x77, 0x59, 0x43, 0x5, 0x1f, 0x31, 0x2b, 0x67, 0x7d, 0x53, 0x49, 0xf, 0x15, 0x3b, 0x21, 0xb7, 0xad, 0x83, 0x99, 0xdf, 0xc5, 0xeb, 0xf1, 0xda, 0xc0, 0xee, 0xf4, 0xb2, 0xa8, 0x86, 0x9c, 0xa, 0x10, 0x3e, 0x24, 0x62, 0x78, 0x56, 0x4c, 0xce, 0xd4, 0xfa, 0xe0, 0xa6, 0xbc, 0x92, 0x88, 0x1e, 0x4, 0x2a, 0x30, 0x76, 0x6c, 0x42, 0x58, 0x73, 0x69, 0x47, 0x5d, 0x1b, 0x1, 0x2f, 0x35, 0xa3, 0xb9, 0x97, 0x8d, 0xcb, 0xd1, 0xff, 0xe5, 0xa9, 0xb3, 0x9d, 0x87, 0xc1, 0xdb, 0xf5, 0xef, 0x79, 0x63, 0x4d, 0x57, 0x11, 0xb, 0x25, 0x3f, 0x14, 0xe, 0x20, 0x3a, 0x7c, 0x66, 0x48, 0x52, 0xc4, 0xde, 0xf0, 0xea, 0xac, 0xb6, 0x98, 0x82, 0x81, 0x9b, 0xb5, 0xaf, 0xe9, 0xf3, 0xdd, 0xc7, 0x51, 0x4b, 0x65, 0x7f, 0x39, 0x23, 0xd, 0x17, 0x3c, 0x26, 0x8, 0x12, 0x54, 0x4e, 0x60, 0x7a, 0xec, 0xf6, 0xd8, 0xc2, 0x84, 0x9e, 0xb0, 0xaa, 0xe6, 0xfc, 0xd2, 0xc8, 0x8e, 0x94, 0xba, 0xa0, 0x36, 0x2c, 0x2, 0x18, 0x5e, 0x44, 0x6a, 0x70, 0x5b, 0x41, 0x6f, 0x75, 0x33, 0x29, 0x7, 0x1d, 0x8b, 0x91, 0xbf, 0xa5, 0xe3, 0xf9, 0xd7, 0xcd, 0x4f, 0x55, 0x7b, 0x61, 0x27, 0x3d, 0x13, 0x9, 0x9f, 0x85, 0xab, 0xb1, 0xf7, 0xed, 0xc3, 0xd9, 0xf2, 0xe8, 0xc6, 0xdc, 0x9a, 0x80, 0xae, 0xb4, 0x22, 0x38, 0x16, 0xc, 0x4a, 0x50, 0x7e, 0x64, 0x28, 0x32, 0x1c, 0x6, 0x40, 0x5a, 0x74, 0x6e, 0xf8, 0xe2, 0xcc, 0xd6, 0x90, 0x8a, 0xa4, 0xbe, 0x95, 0x8f, 0xa1, 0xbb, 0xfd, 0xe7, 0xc9, 0xd3, 0x45, 0x5f, 0x71, 0x6b, 0x2d, 0x37, 0x19, 0x3},
+ {0x0, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99, 0xad, 0xb6, 0x9b, 0x80, 0xc1, 0xda, 0xf7, 0xec, 0x75, 0x6e, 0x43, 0x58, 0x19, 0x2, 0x2f, 0x34, 0x47, 0x5c, 0x71, 0x6a, 0x2b, 0x30, 0x1d, 0x6, 0x9f, 0x84, 0xa9, 0xb2, 0xf3, 0xe8, 0xc5, 0xde, 0xea, 0xf1, 0xdc, 0xc7, 0x86, 0x9d, 0xb0, 0xab, 0x32, 0x29, 0x4, 0x1f, 0x5e, 0x45, 0x68, 0x73, 0x8e, 0x95, 0xb8, 0xa3, 0xe2, 0xf9, 0xd4, 0xcf, 0x56, 0x4d, 0x60, 0x7b, 0x3a, 0x21, 0xc, 0x17, 0x23, 0x38, 0x15, 0xe, 0x4f, 0x54, 0x79, 0x62, 0xfb, 0xe0, 0xcd, 0xd6, 0x97, 0x8c, 0xa1, 0xba, 0xc9, 0xd2, 0xff, 0xe4, 0xa5, 0xbe, 0x93, 0x88, 0x11, 0xa, 0x27, 0x3c, 0x7d, 0x66, 0x4b, 0x50, 0x64, 0x7f, 0x52, 0x49, 0x8, 0x13, 0x3e, 0x25, 0xbc, 0xa7, 0x8a, 0x91, 0xd0, 0xcb, 0xe6, 0xfd, 0x1, 0x1a, 0x37, 0x2c, 0x6d, 0x76, 0x5b, 0x40, 0xd9, 0xc2, 0xef, 0xf4, 0xb5, 0xae, 0x83, 0x98, 0xac, 0xb7, 0x9a, 0x81, 0xc0, 0xdb, 0xf6, 0xed, 0x74, 0x6f, 0x42, 0x59, 0x18, 0x3, 0x2e, 0x35, 0x46, 0x5d, 0x70, 0x6b, 0x2a, 0x31, 0x1c, 0x7, 0x9e, 0x85, 0xa8, 0xb3, 0xf2, 0xe9, 0xc4, 0xdf, 0xeb, 0xf0, 0xdd, 0xc6, 0x87, 0x9c, 0xb1, 0xaa, 0x33, 0x28, 0x5, 0x1e, 0x5f, 0x44, 0x69, 0x72, 0x8f, 0x94, 0xb9, 0xa2, 0xe3, 0xf8, 0xd5, 0xce, 0x57, 0x4c, 0x61, 0x7a, 0x3b, 0x20, 0xd, 0x16, 0x22, 0x39, 0x14, 0xf, 0x4e, 0x55, 0x78, 0x63, 0xfa, 0xe1, 0xcc, 0xd7, 0x96, 0x8d, 0xa0, 0xbb, 0xc8, 0xd3, 0xfe, 0xe5, 0xa4, 0xbf, 0x92, 0x89, 0x10, 0xb, 0x26, 0x3d, 0x7c, 0x67, 0x4a, 0x51, 0x65, 0x7e, 0x53, 0x48, 0x9, 0x12, 0x3f, 0x24, 0xbd, 0xa6, 0x8b, 0x90, 0xd1, 0xca, 0xe7, 0xfc},
+ {0x0, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4, 0xdd, 0xc1, 0xe5, 0xf9, 0xad, 0xb1, 0x95, 0x89, 0x3d, 0x21, 0x5, 0x19, 0x4d, 0x51, 0x75, 0x69, 0xa7, 0xbb, 0x9f, 0x83, 0xd7, 0xcb, 0xef, 0xf3, 0x47, 0x5b, 0x7f, 0x63, 0x37, 0x2b, 0xf, 0x13, 0x7a, 0x66, 0x42, 0x5e, 0xa, 0x16, 0x32, 0x2e, 0x9a, 0x86, 0xa2, 0xbe, 0xea, 0xf6, 0xd2, 0xce, 0x53, 0x4f, 0x6b, 0x77, 0x23, 0x3f, 0x1b, 0x7, 0xb3, 0xaf, 0x8b, 0x97, 0xc3, 0xdf, 0xfb, 0xe7, 0x8e, 0x92, 0xb6, 0xaa, 0xfe, 0xe2, 0xc6, 0xda, 0x6e, 0x72, 0x56, 0x4a, 0x1e, 0x2, 0x26, 0x3a, 0xf4, 0xe8, 0xcc, 0xd0, 0x84, 0x98, 0xbc, 0xa0, 0x14, 0x8, 0x2c, 0x30, 0x64, 0x78, 0x5c, 0x40, 0x29, 0x35, 0x11, 0xd, 0x59, 0x45, 0x61, 0x7d, 0xc9, 0xd5, 0xf1, 0xed, 0xb9, 0xa5, 0x81, 0x9d, 0xa6, 0xba, 0x9e, 0x82, 0xd6, 0xca, 0xee, 0xf2, 0x46, 0x5a, 0x7e, 0x62, 0x36, 0x2a, 0xe, 0x12, 0x7b, 0x67, 0x43, 0x5f, 0xb, 0x17, 0x33, 0x2f, 0x9b, 0x87, 0xa3, 0xbf, 0xeb, 0xf7, 0xd3, 0xcf, 0x1, 0x1d, 0x39, 0x25, 0x71, 0x6d, 0x49, 0x55, 0xe1, 0xfd, 0xd9, 0xc5, 0x91, 0x8d, 0xa9, 0xb5, 0xdc, 0xc0, 0xe4, 0xf8, 0xac, 0xb0, 0x94, 0x88, 0x3c, 0x20, 0x4, 0x18, 0x4c, 0x50, 0x74, 0x68, 0xf5, 0xe9, 0xcd, 0xd1, 0x85, 0x99, 0xbd, 0xa1, 0x15, 0x9, 0x2d, 0x31, 0x65, 0x79, 0x5d, 0x41, 0x28, 0x34, 0x10, 0xc, 0x58, 0x44, 0x60, 0x7c, 0xc8, 0xd4, 0xf0, 0xec, 0xb8, 0xa4, 0x80, 0x9c, 0x52, 0x4e, 0x6a, 0x76, 0x22, 0x3e, 0x1a, 0x6, 0xb2, 0xae, 0x8a, 0x96, 0xc2, 0xde, 0xfa, 0xe6, 0x8f, 0x93, 0xb7, 0xab, 0xff, 0xe3, 0xc7, 0xdb, 0x6f, 0x73, 0x57, 0x4b, 0x1f, 0x3, 0x27, 0x3b},
+ {0x0, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb, 0xcd, 0xd0, 0xf7, 0xea, 0xb9, 0xa4, 0x83, 0x9e, 0x25, 0x38, 0x1f, 0x2, 0x51, 0x4c, 0x6b, 0x76, 0x87, 0x9a, 0xbd, 0xa0, 0xf3, 0xee, 0xc9, 0xd4, 0x6f, 0x72, 0x55, 0x48, 0x1b, 0x6, 0x21, 0x3c, 0x4a, 0x57, 0x70, 0x6d, 0x3e, 0x23, 0x4, 0x19, 0xa2, 0xbf, 0x98, 0x85, 0xd6, 0xcb, 0xec, 0xf1, 0x13, 0xe, 0x29, 0x34, 0x67, 0x7a, 0x5d, 0x40, 0xfb, 0xe6, 0xc1, 0xdc, 0x8f, 0x92, 0xb5, 0xa8, 0xde, 0xc3, 0xe4, 0xf9, 0xaa, 0xb7, 0x90, 0x8d, 0x36, 0x2b, 0xc, 0x11, 0x42, 0x5f, 0x78, 0x65, 0x94, 0x89, 0xae, 0xb3, 0xe0, 0xfd, 0xda, 0xc7, 0x7c, 0x61, 0x46, 0x5b, 0x8, 0x15, 0x32, 0x2f, 0x59, 0x44, 0x63, 0x7e, 0x2d, 0x30, 0x17, 0xa, 0xb1, 0xac, 0x8b, 0x96, 0xc5, 0xd8, 0xff, 0xe2, 0x26, 0x3b, 0x1c, 0x1, 0x52, 0x4f, 0x68, 0x75, 0xce, 0xd3, 0xf4, 0xe9, 0xba, 0xa7, 0x80, 0x9d, 0xeb, 0xf6, 0xd1, 0xcc, 0x9f, 0x82, 0xa5, 0xb8, 0x3, 0x1e, 0x39, 0x24, 0x77, 0x6a, 0x4d, 0x50, 0xa1, 0xbc, 0x9b, 0x86, 0xd5, 0xc8, 0xef, 0xf2, 0x49, 0x54, 0x73, 0x6e, 0x3d, 0x20, 0x7, 0x1a, 0x6c, 0x71, 0x56, 0x4b, 0x18, 0x5, 0x22, 0x3f, 0x84, 0x99, 0xbe, 0xa3, 0xf0, 0xed, 0xca, 0xd7, 0x35, 0x28, 0xf, 0x12, 0x41, 0x5c, 0x7b, 0x66, 0xdd, 0xc0, 0xe7, 0xfa, 0xa9, 0xb4, 0x93, 0x8e, 0xf8, 0xe5, 0xc2, 0xdf, 0x8c, 0x91, 0xb6, 0xab, 0x10, 0xd, 0x2a, 0x37, 0x64, 0x79, 0x5e, 0x43, 0xb2, 0xaf, 0x88, 0x95, 0xc6, 0xdb, 0xfc, 0xe1, 0x5a, 0x47, 0x60, 0x7d, 0x2e, 0x33, 0x14, 0x9, 0x7f, 0x62, 0x45, 0x58, 0xb, 0x16, 0x31, 0x2c, 0x97, 0x8a, 0xad, 0xb0, 0xe3, 0xfe, 0xd9, 0xc4},
+ {0x0, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa, 0xfd, 0xe3, 0xc1, 0xdf, 0x85, 0x9b, 0xb9, 0xa7, 0xd, 0x13, 0x31, 0x2f, 0x75, 0x6b, 0x49, 0x57, 0xe7, 0xf9, 0xdb, 0xc5, 0x9f, 0x81, 0xa3, 0xbd, 0x17, 0x9, 0x2b, 0x35, 0x6f, 0x71, 0x53, 0x4d, 0x1a, 0x4, 0x26, 0x38, 0x62, 0x7c, 0x5e, 0x40, 0xea, 0xf4, 0xd6, 0xc8, 0x92, 0x8c, 0xae, 0xb0, 0xd3, 0xcd, 0xef, 0xf1, 0xab, 0xb5, 0x97, 0x89, 0x23, 0x3d, 0x1f, 0x1, 0x5b, 0x45, 0x67, 0x79, 0x2e, 0x30, 0x12, 0xc, 0x56, 0x48, 0x6a, 0x74, 0xde, 0xc0, 0xe2, 0xfc, 0xa6, 0xb8, 0x9a, 0x84, 0x34, 0x2a, 0x8, 0x16, 0x4c, 0x52, 0x70, 0x6e, 0xc4, 0xda, 0xf8, 0xe6, 0xbc, 0xa2, 0x80, 0x9e, 0xc9, 0xd7, 0xf5, 0xeb, 0xb1, 0xaf, 0x8d, 0x93, 0x39, 0x27, 0x5, 0x1b, 0x41, 0x5f, 0x7d, 0x63, 0xbb, 0xa5, 0x87, 0x99, 0xc3, 0xdd, 0xff, 0xe1, 0x4b, 0x55, 0x77, 0x69, 0x33, 0x2d, 0xf, 0x11, 0x46, 0x58, 0x7a, 0x64, 0x3e, 0x20, 0x2, 0x1c, 0xb6, 0xa8, 0x8a, 0x94, 0xce, 0xd0, 0xf2, 0xec, 0x5c, 0x42, 0x60, 0x7e, 0x24, 0x3a, 0x18, 0x6, 0xac, 0xb2, 0x90, 0x8e, 0xd4, 0xca, 0xe8, 0xf6, 0xa1, 0xbf, 0x9d, 0x83, 0xd9, 0xc7, 0xe5, 0xfb, 0x51, 0x4f, 0x6d, 0x73, 0x29, 0x37, 0x15, 0xb, 0x68, 0x76, 0x54, 0x4a, 0x10, 0xe, 0x2c, 0x32, 0x98, 0x86, 0xa4, 0xba, 0xe0, 0xfe, 0xdc, 0xc2, 0x95, 0x8b, 0xa9, 0xb7, 0xed, 0xf3, 0xd1, 0xcf, 0x65, 0x7b, 0x59, 0x47, 0x1d, 0x3, 0x21, 0x3f, 0x8f, 0x91, 0xb3, 0xad, 0xf7, 0xe9, 0xcb, 0xd5, 0x7f, 0x61, 0x43, 0x5d, 0x7, 0x19, 0x3b, 0x25, 0x72, 0x6c, 0x4e, 0x50, 0xa, 0x14, 0x36, 0x28, 0x82, 0x9c, 0xbe, 0xa0, 0xfa, 0xe4, 0xc6, 0xd8},
+ {0x0, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5, 0xed, 0xf2, 0xd3, 0xcc, 0x91, 0x8e, 0xaf, 0xb0, 0x15, 0xa, 0x2b, 0x34, 0x69, 0x76, 0x57, 0x48, 0xc7, 0xd8, 0xf9, 0xe6, 0xbb, 0xa4, 0x85, 0x9a, 0x3f, 0x20, 0x1, 0x1e, 0x43, 0x5c, 0x7d, 0x62, 0x2a, 0x35, 0x14, 0xb, 0x56, 0x49, 0x68, 0x77, 0xd2, 0xcd, 0xec, 0xf3, 0xae, 0xb1, 0x90, 0x8f, 0x93, 0x8c, 0xad, 0xb2, 0xef, 0xf0, 0xd1, 0xce, 0x6b, 0x74, 0x55, 0x4a, 0x17, 0x8, 0x29, 0x36, 0x7e, 0x61, 0x40, 0x5f, 0x2, 0x1d, 0x3c, 0x23, 0x86, 0x99, 0xb8, 0xa7, 0xfa, 0xe5, 0xc4, 0xdb, 0x54, 0x4b, 0x6a, 0x75, 0x28, 0x37, 0x16, 0x9, 0xac, 0xb3, 0x92, 0x8d, 0xd0, 0xcf, 0xee, 0xf1, 0xb9, 0xa6, 0x87, 0x98, 0xc5, 0xda, 0xfb, 0xe4, 0x41, 0x5e, 0x7f, 0x60, 0x3d, 0x22, 0x3, 0x1c, 0x3b, 0x24, 0x5, 0x1a, 0x47, 0x58, 0x79, 0x66, 0xc3, 0xdc, 0xfd, 0xe2, 0xbf, 0xa0, 0x81, 0x9e, 0xd6, 0xc9, 0xe8, 0xf7, 0xaa, 0xb5, 0x94, 0x8b, 0x2e, 0x31, 0x10, 0xf, 0x52, 0x4d, 0x6c, 0x73, 0xfc, 0xe3, 0xc2, 0xdd, 0x80, 0x9f, 0xbe, 0xa1, 0x4, 0x1b, 0x3a, 0x25, 0x78, 0x67, 0x46, 0x59, 0x11, 0xe, 0x2f, 0x30, 0x6d, 0x72, 0x53, 0x4c, 0xe9, 0xf6, 0xd7, 0xc8, 0x95, 0x8a, 0xab, 0xb4, 0xa8, 0xb7, 0x96, 0x89, 0xd4, 0xcb, 0xea, 0xf5, 0x50, 0x4f, 0x6e, 0x71, 0x2c, 0x33, 0x12, 0xd, 0x45, 0x5a, 0x7b, 0x64, 0x39, 0x26, 0x7, 0x18, 0xbd, 0xa2, 0x83, 0x9c, 0xc1, 0xde, 0xff, 0xe0, 0x6f, 0x70, 0x51, 0x4e, 0x13, 0xc, 0x2d, 0x32, 0x97, 0x88, 0xa9, 0xb6, 0xeb, 0xf4, 0xd5, 0xca, 0x82, 0x9d, 0xbc, 0xa3, 0xfe, 0xe1, 0xc0, 0xdf, 0x7a, 0x65, 0x44, 0x5b, 0x6, 0x19, 0x38, 0x27},
+ {0x0, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd, 0x3a, 0x1a, 0x7a, 0x5a, 0xba, 0x9a, 0xfa, 0xda, 0x27, 0x7, 0x67, 0x47, 0xa7, 0x87, 0xe7, 0xc7, 0x74, 0x54, 0x34, 0x14, 0xf4, 0xd4, 0xb4, 0x94, 0x69, 0x49, 0x29, 0x9, 0xe9, 0xc9, 0xa9, 0x89, 0x4e, 0x6e, 0xe, 0x2e, 0xce, 0xee, 0x8e, 0xae, 0x53, 0x73, 0x13, 0x33, 0xd3, 0xf3, 0x93, 0xb3, 0xe8, 0xc8, 0xa8, 0x88, 0x68, 0x48, 0x28, 0x8, 0xf5, 0xd5, 0xb5, 0x95, 0x75, 0x55, 0x35, 0x15, 0xd2, 0xf2, 0x92, 0xb2, 0x52, 0x72, 0x12, 0x32, 0xcf, 0xef, 0x8f, 0xaf, 0x4f, 0x6f, 0xf, 0x2f, 0x9c, 0xbc, 0xdc, 0xfc, 0x1c, 0x3c, 0x5c, 0x7c, 0x81, 0xa1, 0xc1, 0xe1, 0x1, 0x21, 0x41, 0x61, 0xa6, 0x86, 0xe6, 0xc6, 0x26, 0x6, 0x66, 0x46, 0xbb, 0x9b, 0xfb, 0xdb, 0x3b, 0x1b, 0x7b, 0x5b, 0xcd, 0xed, 0x8d, 0xad, 0x4d, 0x6d, 0xd, 0x2d, 0xd0, 0xf0, 0x90, 0xb0, 0x50, 0x70, 0x10, 0x30, 0xf7, 0xd7, 0xb7, 0x97, 0x77, 0x57, 0x37, 0x17, 0xea, 0xca, 0xaa, 0x8a, 0x6a, 0x4a, 0x2a, 0xa, 0xb9, 0x99, 0xf9, 0xd9, 0x39, 0x19, 0x79, 0x59, 0xa4, 0x84, 0xe4, 0xc4, 0x24, 0x4, 0x64, 0x44, 0x83, 0xa3, 0xc3, 0xe3, 0x3, 0x23, 0x43, 0x63, 0x9e, 0xbe, 0xde, 0xfe, 0x1e, 0x3e, 0x5e, 0x7e, 0x25, 0x5, 0x65, 0x45, 0xa5, 0x85, 0xe5, 0xc5, 0x38, 0x18, 0x78, 0x58, 0xb8, 0x98, 0xf8, 0xd8, 0x1f, 0x3f, 0x5f, 0x7f, 0x9f, 0xbf, 0xdf, 0xff, 0x2, 0x22, 0x42, 0x62, 0x82, 0xa2, 0xc2, 0xe2, 0x51, 0x71, 0x11, 0x31, 0xd1, 0xf1, 0x91, 0xb1, 0x4c, 0x6c, 0xc, 0x2c, 0xcc, 0xec, 0x8c, 0xac, 0x6b, 0x4b, 0x2b, 0xb, 0xeb, 0xcb, 0xab, 0x8b, 0x76, 0x56, 0x36, 0x16, 0xf6, 0xd6, 0xb6, 0x96},
+ {0x0, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2, 0x2a, 0xb, 0x68, 0x49, 0xae, 0x8f, 0xec, 0xcd, 0x3f, 0x1e, 0x7d, 0x5c, 0xbb, 0x9a, 0xf9, 0xd8, 0x54, 0x75, 0x16, 0x37, 0xd0, 0xf1, 0x92, 0xb3, 0x41, 0x60, 0x3, 0x22, 0xc5, 0xe4, 0x87, 0xa6, 0x7e, 0x5f, 0x3c, 0x1d, 0xfa, 0xdb, 0xb8, 0x99, 0x6b, 0x4a, 0x29, 0x8, 0xef, 0xce, 0xad, 0x8c, 0xa8, 0x89, 0xea, 0xcb, 0x2c, 0xd, 0x6e, 0x4f, 0xbd, 0x9c, 0xff, 0xde, 0x39, 0x18, 0x7b, 0x5a, 0x82, 0xa3, 0xc0, 0xe1, 0x6, 0x27, 0x44, 0x65, 0x97, 0xb6, 0xd5, 0xf4, 0x13, 0x32, 0x51, 0x70, 0xfc, 0xdd, 0xbe, 0x9f, 0x78, 0x59, 0x3a, 0x1b, 0xe9, 0xc8, 0xab, 0x8a, 0x6d, 0x4c, 0x2f, 0xe, 0xd6, 0xf7, 0x94, 0xb5, 0x52, 0x73, 0x10, 0x31, 0xc3, 0xe2, 0x81, 0xa0, 0x47, 0x66, 0x5, 0x24, 0x4d, 0x6c, 0xf, 0x2e, 0xc9, 0xe8, 0x8b, 0xaa, 0x58, 0x79, 0x1a, 0x3b, 0xdc, 0xfd, 0x9e, 0xbf, 0x67, 0x46, 0x25, 0x4, 0xe3, 0xc2, 0xa1, 0x80, 0x72, 0x53, 0x30, 0x11, 0xf6, 0xd7, 0xb4, 0x95, 0x19, 0x38, 0x5b, 0x7a, 0x9d, 0xbc, 0xdf, 0xfe, 0xc, 0x2d, 0x4e, 0x6f, 0x88, 0xa9, 0xca, 0xeb, 0x33, 0x12, 0x71, 0x50, 0xb7, 0x96, 0xf5, 0xd4, 0x26, 0x7, 0x64, 0x45, 0xa2, 0x83, 0xe0, 0xc1, 0xe5, 0xc4, 0xa7, 0x86, 0x61, 0x40, 0x23, 0x2, 0xf0, 0xd1, 0xb2, 0x93, 0x74, 0x55, 0x36, 0x17, 0xcf, 0xee, 0x8d, 0xac, 0x4b, 0x6a, 0x9, 0x28, 0xda, 0xfb, 0x98, 0xb9, 0x5e, 0x7f, 0x1c, 0x3d, 0xb1, 0x90, 0xf3, 0xd2, 0x35, 0x14, 0x77, 0x56, 0xa4, 0x85, 0xe6, 0xc7, 0x20, 0x1, 0x62, 0x43, 0x9b, 0xba, 0xd9, 0xf8, 0x1f, 0x3e, 0x5d, 0x7c, 0x8e, 0xaf, 0xcc, 0xed, 0xa, 0x2b, 0x48, 0x69},
+ {0x0, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0xd, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3, 0x1a, 0x38, 0x5e, 0x7c, 0x92, 0xb0, 0xd6, 0xf4, 0x17, 0x35, 0x53, 0x71, 0x9f, 0xbd, 0xdb, 0xf9, 0x34, 0x16, 0x70, 0x52, 0xbc, 0x9e, 0xf8, 0xda, 0x39, 0x1b, 0x7d, 0x5f, 0xb1, 0x93, 0xf5, 0xd7, 0x2e, 0xc, 0x6a, 0x48, 0xa6, 0x84, 0xe2, 0xc0, 0x23, 0x1, 0x67, 0x45, 0xab, 0x89, 0xef, 0xcd, 0x68, 0x4a, 0x2c, 0xe, 0xe0, 0xc2, 0xa4, 0x86, 0x65, 0x47, 0x21, 0x3, 0xed, 0xcf, 0xa9, 0x8b, 0x72, 0x50, 0x36, 0x14, 0xfa, 0xd8, 0xbe, 0x9c, 0x7f, 0x5d, 0x3b, 0x19, 0xf7, 0xd5, 0xb3, 0x91, 0x5c, 0x7e, 0x18, 0x3a, 0xd4, 0xf6, 0x90, 0xb2, 0x51, 0x73, 0x15, 0x37, 0xd9, 0xfb, 0x9d, 0xbf, 0x46, 0x64, 0x2, 0x20, 0xce, 0xec, 0x8a, 0xa8, 0x4b, 0x69, 0xf, 0x2d, 0xc3, 0xe1, 0x87, 0xa5, 0xd0, 0xf2, 0x94, 0xb6, 0x58, 0x7a, 0x1c, 0x3e, 0xdd, 0xff, 0x99, 0xbb, 0x55, 0x77, 0x11, 0x33, 0xca, 0xe8, 0x8e, 0xac, 0x42, 0x60, 0x6, 0x24, 0xc7, 0xe5, 0x83, 0xa1, 0x4f, 0x6d, 0xb, 0x29, 0xe4, 0xc6, 0xa0, 0x82, 0x6c, 0x4e, 0x28, 0xa, 0xe9, 0xcb, 0xad, 0x8f, 0x61, 0x43, 0x25, 0x7, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0xf3, 0xd1, 0xb7, 0x95, 0x7b, 0x59, 0x3f, 0x1d, 0xb8, 0x9a, 0xfc, 0xde, 0x30, 0x12, 0x74, 0x56, 0xb5, 0x97, 0xf1, 0xd3, 0x3d, 0x1f, 0x79, 0x5b, 0xa2, 0x80, 0xe6, 0xc4, 0x2a, 0x8, 0x6e, 0x4c, 0xaf, 0x8d, 0xeb, 0xc9, 0x27, 0x5, 0x63, 0x41, 0x8c, 0xae, 0xc8, 0xea, 0x4, 0x26, 0x40, 0x62, 0x81, 0xa3, 0xc5, 0xe7, 0x9, 0x2b, 0x4d, 0x6f, 0x96, 0xb4, 0xd2, 0xf0, 0x1e, 0x3c, 0x5a, 0x78, 0x9b, 0xb9, 0xdf, 0xfd, 0x13, 0x31, 0x57, 0x75},
+ {0x0, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x5, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec, 0xa, 0x29, 0x4c, 0x6f, 0x86, 0xa5, 0xc0, 0xe3, 0xf, 0x2c, 0x49, 0x6a, 0x83, 0xa0, 0xc5, 0xe6, 0x14, 0x37, 0x52, 0x71, 0x98, 0xbb, 0xde, 0xfd, 0x11, 0x32, 0x57, 0x74, 0x9d, 0xbe, 0xdb, 0xf8, 0x1e, 0x3d, 0x58, 0x7b, 0x92, 0xb1, 0xd4, 0xf7, 0x1b, 0x38, 0x5d, 0x7e, 0x97, 0xb4, 0xd1, 0xf2, 0x28, 0xb, 0x6e, 0x4d, 0xa4, 0x87, 0xe2, 0xc1, 0x2d, 0xe, 0x6b, 0x48, 0xa1, 0x82, 0xe7, 0xc4, 0x22, 0x1, 0x64, 0x47, 0xae, 0x8d, 0xe8, 0xcb, 0x27, 0x4, 0x61, 0x42, 0xab, 0x88, 0xed, 0xce, 0x3c, 0x1f, 0x7a, 0x59, 0xb0, 0x93, 0xf6, 0xd5, 0x39, 0x1a, 0x7f, 0x5c, 0xb5, 0x96, 0xf3, 0xd0, 0x36, 0x15, 0x70, 0x53, 0xba, 0x99, 0xfc, 0xdf, 0x33, 0x10, 0x75, 0x56, 0xbf, 0x9c, 0xf9, 0xda, 0x50, 0x73, 0x16, 0x35, 0xdc, 0xff, 0x9a, 0xb9, 0x55, 0x76, 0x13, 0x30, 0xd9, 0xfa, 0x9f, 0xbc, 0x5a, 0x79, 0x1c, 0x3f, 0xd6, 0xf5, 0x90, 0xb3, 0x5f, 0x7c, 0x19, 0x3a, 0xd3, 0xf0, 0x95, 0xb6, 0x44, 0x67, 0x2, 0x21, 0xc8, 0xeb, 0x8e, 0xad, 0x41, 0x62, 0x7, 0x24, 0xcd, 0xee, 0x8b, 0xa8, 0x4e, 0x6d, 0x8, 0x2b, 0xc2, 0xe1, 0x84, 0xa7, 0x4b, 0x68, 0xd, 0x2e, 0xc7, 0xe4, 0x81, 0xa2, 0x78, 0x5b, 0x3e, 0x1d, 0xf4, 0xd7, 0xb2, 0x91, 0x7d, 0x5e, 0x3b, 0x18, 0xf1, 0xd2, 0xb7, 0x94, 0x72, 0x51, 0x34, 0x17, 0xfe, 0xdd, 0xb8, 0x9b, 0x77, 0x54, 0x31, 0x12, 0xfb, 0xd8, 0xbd, 0x9e, 0x6c, 0x4f, 0x2a, 0x9, 0xe0, 0xc3, 0xa6, 0x85, 0x69, 0x4a, 0x2f, 0xc, 0xe5, 0xc6, 0xa3, 0x80, 0x66, 0x45, 0x20, 0x3, 0xea, 0xc9, 0xac, 0x8f, 0x63, 0x40, 0x25, 0x6, 0xef, 0xcc, 0xa9, 0x8a},
+ {0x0, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1, 0x7a, 0x5e, 0x32, 0x16, 0xea, 0xce, 0xa2, 0x86, 0x47, 0x63, 0xf, 0x2b, 0xd7, 0xf3, 0x9f, 0xbb, 0xf4, 0xd0, 0xbc, 0x98, 0x64, 0x40, 0x2c, 0x8, 0xc9, 0xed, 0x81, 0xa5, 0x59, 0x7d, 0x11, 0x35, 0x8e, 0xaa, 0xc6, 0xe2, 0x1e, 0x3a, 0x56, 0x72, 0xb3, 0x97, 0xfb, 0xdf, 0x23, 0x7, 0x6b, 0x4f, 0xf5, 0xd1, 0xbd, 0x99, 0x65, 0x41, 0x2d, 0x9, 0xc8, 0xec, 0x80, 0xa4, 0x58, 0x7c, 0x10, 0x34, 0x8f, 0xab, 0xc7, 0xe3, 0x1f, 0x3b, 0x57, 0x73, 0xb2, 0x96, 0xfa, 0xde, 0x22, 0x6, 0x6a, 0x4e, 0x1, 0x25, 0x49, 0x6d, 0x91, 0xb5, 0xd9, 0xfd, 0x3c, 0x18, 0x74, 0x50, 0xac, 0x88, 0xe4, 0xc0, 0x7b, 0x5f, 0x33, 0x17, 0xeb, 0xcf, 0xa3, 0x87, 0x46, 0x62, 0xe, 0x2a, 0xd6, 0xf2, 0x9e, 0xba, 0xf7, 0xd3, 0xbf, 0x9b, 0x67, 0x43, 0x2f, 0xb, 0xca, 0xee, 0x82, 0xa6, 0x5a, 0x7e, 0x12, 0x36, 0x8d, 0xa9, 0xc5, 0xe1, 0x1d, 0x39, 0x55, 0x71, 0xb0, 0x94, 0xf8, 0xdc, 0x20, 0x4, 0x68, 0x4c, 0x3, 0x27, 0x4b, 0x6f, 0x93, 0xb7, 0xdb, 0xff, 0x3e, 0x1a, 0x76, 0x52, 0xae, 0x8a, 0xe6, 0xc2, 0x79, 0x5d, 0x31, 0x15, 0xe9, 0xcd, 0xa1, 0x85, 0x44, 0x60, 0xc, 0x28, 0xd4, 0xf0, 0x9c, 0xb8, 0x2, 0x26, 0x4a, 0x6e, 0x92, 0xb6, 0xda, 0xfe, 0x3f, 0x1b, 0x77, 0x53, 0xaf, 0x8b, 0xe7, 0xc3, 0x78, 0x5c, 0x30, 0x14, 0xe8, 0xcc, 0xa0, 0x84, 0x45, 0x61, 0xd, 0x29, 0xd5, 0xf1, 0x9d, 0xb9, 0xf6, 0xd2, 0xbe, 0x9a, 0x66, 0x42, 0x2e, 0xa, 0xcb, 0xef, 0x83, 0xa7, 0x5b, 0x7f, 0x13, 0x37, 0x8c, 0xa8, 0xc4, 0xe0, 0x1c, 0x38, 0x54, 0x70, 0xb1, 0x95, 0xf9, 0xdd, 0x21, 0x5, 0x69, 0x4d},
+ {0x0, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce, 0x6a, 0x4f, 0x20, 0x5, 0xfe, 0xdb, 0xb4, 0x91, 0x5f, 0x7a, 0x15, 0x30, 0xcb, 0xee, 0x81, 0xa4, 0xd4, 0xf1, 0x9e, 0xbb, 0x40, 0x65, 0xa, 0x2f, 0xe1, 0xc4, 0xab, 0x8e, 0x75, 0x50, 0x3f, 0x1a, 0xbe, 0x9b, 0xf4, 0xd1, 0x2a, 0xf, 0x60, 0x45, 0x8b, 0xae, 0xc1, 0xe4, 0x1f, 0x3a, 0x55, 0x70, 0xb5, 0x90, 0xff, 0xda, 0x21, 0x4, 0x6b, 0x4e, 0x80, 0xa5, 0xca, 0xef, 0x14, 0x31, 0x5e, 0x7b, 0xdf, 0xfa, 0x95, 0xb0, 0x4b, 0x6e, 0x1, 0x24, 0xea, 0xcf, 0xa0, 0x85, 0x7e, 0x5b, 0x34, 0x11, 0x61, 0x44, 0x2b, 0xe, 0xf5, 0xd0, 0xbf, 0x9a, 0x54, 0x71, 0x1e, 0x3b, 0xc0, 0xe5, 0x8a, 0xaf, 0xb, 0x2e, 0x41, 0x64, 0x9f, 0xba, 0xd5, 0xf0, 0x3e, 0x1b, 0x74, 0x51, 0xaa, 0x8f, 0xe0, 0xc5, 0x77, 0x52, 0x3d, 0x18, 0xe3, 0xc6, 0xa9, 0x8c, 0x42, 0x67, 0x8, 0x2d, 0xd6, 0xf3, 0x9c, 0xb9, 0x1d, 0x38, 0x57, 0x72, 0x89, 0xac, 0xc3, 0xe6, 0x28, 0xd, 0x62, 0x47, 0xbc, 0x99, 0xf6, 0xd3, 0xa3, 0x86, 0xe9, 0xcc, 0x37, 0x12, 0x7d, 0x58, 0x96, 0xb3, 0xdc, 0xf9, 0x2, 0x27, 0x48, 0x6d, 0xc9, 0xec, 0x83, 0xa6, 0x5d, 0x78, 0x17, 0x32, 0xfc, 0xd9, 0xb6, 0x93, 0x68, 0x4d, 0x22, 0x7, 0xc2, 0xe7, 0x88, 0xad, 0x56, 0x73, 0x1c, 0x39, 0xf7, 0xd2, 0xbd, 0x98, 0x63, 0x46, 0x29, 0xc, 0xa8, 0x8d, 0xe2, 0xc7, 0x3c, 0x19, 0x76, 0x53, 0x9d, 0xb8, 0xd7, 0xf2, 0x9, 0x2c, 0x43, 0x66, 0x16, 0x33, 0x5c, 0x79, 0x82, 0xa7, 0xc8, 0xed, 0x23, 0x6, 0x69, 0x4c, 0xb7, 0x92, 0xfd, 0xd8, 0x7c, 0x59, 0x36, 0x13, 0xe8, 0xcd, 0xa2, 0x87, 0x49, 0x6c, 0x3, 0x26, 0xdd, 0xf8, 0x97, 0xb2},
+ {0x0, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0xb, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf, 0x5a, 0x7c, 0x16, 0x30, 0xc2, 0xe4, 0x8e, 0xa8, 0x77, 0x51, 0x3b, 0x1d, 0xef, 0xc9, 0xa3, 0x85, 0xb4, 0x92, 0xf8, 0xde, 0x2c, 0xa, 0x60, 0x46, 0x99, 0xbf, 0xd5, 0xf3, 0x1, 0x27, 0x4d, 0x6b, 0xee, 0xc8, 0xa2, 0x84, 0x76, 0x50, 0x3a, 0x1c, 0xc3, 0xe5, 0x8f, 0xa9, 0x5b, 0x7d, 0x17, 0x31, 0x75, 0x53, 0x39, 0x1f, 0xed, 0xcb, 0xa1, 0x87, 0x58, 0x7e, 0x14, 0x32, 0xc0, 0xe6, 0x8c, 0xaa, 0x2f, 0x9, 0x63, 0x45, 0xb7, 0x91, 0xfb, 0xdd, 0x2, 0x24, 0x4e, 0x68, 0x9a, 0xbc, 0xd6, 0xf0, 0xc1, 0xe7, 0x8d, 0xab, 0x59, 0x7f, 0x15, 0x33, 0xec, 0xca, 0xa0, 0x86, 0x74, 0x52, 0x38, 0x1e, 0x9b, 0xbd, 0xd7, 0xf1, 0x3, 0x25, 0x4f, 0x69, 0xb6, 0x90, 0xfa, 0xdc, 0x2e, 0x8, 0x62, 0x44, 0xea, 0xcc, 0xa6, 0x80, 0x72, 0x54, 0x3e, 0x18, 0xc7, 0xe1, 0x8b, 0xad, 0x5f, 0x79, 0x13, 0x35, 0xb0, 0x96, 0xfc, 0xda, 0x28, 0xe, 0x64, 0x42, 0x9d, 0xbb, 0xd1, 0xf7, 0x5, 0x23, 0x49, 0x6f, 0x5e, 0x78, 0x12, 0x34, 0xc6, 0xe0, 0x8a, 0xac, 0x73, 0x55, 0x3f, 0x19, 0xeb, 0xcd, 0xa7, 0x81, 0x4, 0x22, 0x48, 0x6e, 0x9c, 0xba, 0xd0, 0xf6, 0x29, 0xf, 0x65, 0x43, 0xb1, 0x97, 0xfd, 0xdb, 0x9f, 0xb9, 0xd3, 0xf5, 0x7, 0x21, 0x4b, 0x6d, 0xb2, 0x94, 0xfe, 0xd8, 0x2a, 0xc, 0x66, 0x40, 0xc5, 0xe3, 0x89, 0xaf, 0x5d, 0x7b, 0x11, 0x37, 0xe8, 0xce, 0xa4, 0x82, 0x70, 0x56, 0x3c, 0x1a, 0x2b, 0xd, 0x67, 0x41, 0xb3, 0x95, 0xff, 0xd9, 0x6, 0x20, 0x4a, 0x6c, 0x9e, 0xb8, 0xd2, 0xf4, 0x71, 0x57, 0x3d, 0x1b, 0xe9, 0xcf, 0xa5, 0x83, 0x5c, 0x7a, 0x10, 0x36, 0xc4, 0xe2, 0x88, 0xae},
+ {0x0, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x2, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0, 0x4a, 0x6d, 0x4, 0x23, 0xd6, 0xf1, 0x98, 0xbf, 0x6f, 0x48, 0x21, 0x6, 0xf3, 0xd4, 0xbd, 0x9a, 0x94, 0xb3, 0xda, 0xfd, 0x8, 0x2f, 0x46, 0x61, 0xb1, 0x96, 0xff, 0xd8, 0x2d, 0xa, 0x63, 0x44, 0xde, 0xf9, 0x90, 0xb7, 0x42, 0x65, 0xc, 0x2b, 0xfb, 0xdc, 0xb5, 0x92, 0x67, 0x40, 0x29, 0xe, 0x35, 0x12, 0x7b, 0x5c, 0xa9, 0x8e, 0xe7, 0xc0, 0x10, 0x37, 0x5e, 0x79, 0x8c, 0xab, 0xc2, 0xe5, 0x7f, 0x58, 0x31, 0x16, 0xe3, 0xc4, 0xad, 0x8a, 0x5a, 0x7d, 0x14, 0x33, 0xc6, 0xe1, 0x88, 0xaf, 0xa1, 0x86, 0xef, 0xc8, 0x3d, 0x1a, 0x73, 0x54, 0x84, 0xa3, 0xca, 0xed, 0x18, 0x3f, 0x56, 0x71, 0xeb, 0xcc, 0xa5, 0x82, 0x77, 0x50, 0x39, 0x1e, 0xce, 0xe9, 0x80, 0xa7, 0x52, 0x75, 0x1c, 0x3b, 0x6a, 0x4d, 0x24, 0x3, 0xf6, 0xd1, 0xb8, 0x9f, 0x4f, 0x68, 0x1, 0x26, 0xd3, 0xf4, 0x9d, 0xba, 0x20, 0x7, 0x6e, 0x49, 0xbc, 0x9b, 0xf2, 0xd5, 0x5, 0x22, 0x4b, 0x6c, 0x99, 0xbe, 0xd7, 0xf0, 0xfe, 0xd9, 0xb0, 0x97, 0x62, 0x45, 0x2c, 0xb, 0xdb, 0xfc, 0x95, 0xb2, 0x47, 0x60, 0x9, 0x2e, 0xb4, 0x93, 0xfa, 0xdd, 0x28, 0xf, 0x66, 0x41, 0x91, 0xb6, 0xdf, 0xf8, 0xd, 0x2a, 0x43, 0x64, 0x5f, 0x78, 0x11, 0x36, 0xc3, 0xe4, 0x8d, 0xaa, 0x7a, 0x5d, 0x34, 0x13, 0xe6, 0xc1, 0xa8, 0x8f, 0x15, 0x32, 0x5b, 0x7c, 0x89, 0xae, 0xc7, 0xe0, 0x30, 0x17, 0x7e, 0x59, 0xac, 0x8b, 0xe2, 0xc5, 0xcb, 0xec, 0x85, 0xa2, 0x57, 0x70, 0x19, 0x3e, 0xee, 0xc9, 0xa0, 0x87, 0x72, 0x55, 0x3c, 0x1b, 0x81, 0xa6, 0xcf, 0xe8, 0x1d, 0x3a, 0x53, 0x74, 0xa4, 0x83, 0xea, 0xcd, 0x38, 0x1f, 0x76, 0x51},
+ {0x0, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0xd, 0x25, 0xfd, 0xd5, 0xad, 0x85, 0xba, 0x92, 0xea, 0xc2, 0x1a, 0x32, 0x4a, 0x62, 0xe7, 0xcf, 0xb7, 0x9f, 0x47, 0x6f, 0x17, 0x3f, 0x69, 0x41, 0x39, 0x11, 0xc9, 0xe1, 0x99, 0xb1, 0x34, 0x1c, 0x64, 0x4c, 0x94, 0xbc, 0xc4, 0xec, 0xd3, 0xfb, 0x83, 0xab, 0x73, 0x5b, 0x23, 0xb, 0x8e, 0xa6, 0xde, 0xf6, 0x2e, 0x6, 0x7e, 0x56, 0xd2, 0xfa, 0x82, 0xaa, 0x72, 0x5a, 0x22, 0xa, 0x8f, 0xa7, 0xdf, 0xf7, 0x2f, 0x7, 0x7f, 0x57, 0x68, 0x40, 0x38, 0x10, 0xc8, 0xe0, 0x98, 0xb0, 0x35, 0x1d, 0x65, 0x4d, 0x95, 0xbd, 0xc5, 0xed, 0xbb, 0x93, 0xeb, 0xc3, 0x1b, 0x33, 0x4b, 0x63, 0xe6, 0xce, 0xb6, 0x9e, 0x46, 0x6e, 0x16, 0x3e, 0x1, 0x29, 0x51, 0x79, 0xa1, 0x89, 0xf1, 0xd9, 0x5c, 0x74, 0xc, 0x24, 0xfc, 0xd4, 0xac, 0x84, 0xb9, 0x91, 0xe9, 0xc1, 0x19, 0x31, 0x49, 0x61, 0xe4, 0xcc, 0xb4, 0x9c, 0x44, 0x6c, 0x14, 0x3c, 0x3, 0x2b, 0x53, 0x7b, 0xa3, 0x8b, 0xf3, 0xdb, 0x5e, 0x76, 0xe, 0x26, 0xfe, 0xd6, 0xae, 0x86, 0xd0, 0xf8, 0x80, 0xa8, 0x70, 0x58, 0x20, 0x8, 0x8d, 0xa5, 0xdd, 0xf5, 0x2d, 0x5, 0x7d, 0x55, 0x6a, 0x42, 0x3a, 0x12, 0xca, 0xe2, 0x9a, 0xb2, 0x37, 0x1f, 0x67, 0x4f, 0x97, 0xbf, 0xc7, 0xef, 0x6b, 0x43, 0x3b, 0x13, 0xcb, 0xe3, 0x9b, 0xb3, 0x36, 0x1e, 0x66, 0x4e, 0x96, 0xbe, 0xc6, 0xee, 0xd1, 0xf9, 0x81, 0xa9, 0x71, 0x59, 0x21, 0x9, 0x8c, 0xa4, 0xdc, 0xf4, 0x2c, 0x4, 0x7c, 0x54, 0x2, 0x2a, 0x52, 0x7a, 0xa2, 0x8a, 0xf2, 0xda, 0x5f, 0x77, 0xf, 0x27, 0xff, 0xd7, 0xaf, 0x87, 0xb8, 0x90, 0xe8, 0xc0, 0x18, 0x30, 0x48, 0x60, 0xe5, 0xcd, 0xb5, 0x9d, 0x45, 0x6d, 0x15, 0x3d},
+ {0x0, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x7, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a, 0xaa, 0x83, 0xf8, 0xd1, 0xe, 0x27, 0x5c, 0x75, 0xff, 0xd6, 0xad, 0x84, 0x5b, 0x72, 0x9, 0x20, 0x49, 0x60, 0x1b, 0x32, 0xed, 0xc4, 0xbf, 0x96, 0x1c, 0x35, 0x4e, 0x67, 0xb8, 0x91, 0xea, 0xc3, 0xe3, 0xca, 0xb1, 0x98, 0x47, 0x6e, 0x15, 0x3c, 0xb6, 0x9f, 0xe4, 0xcd, 0x12, 0x3b, 0x40, 0x69, 0x92, 0xbb, 0xc0, 0xe9, 0x36, 0x1f, 0x64, 0x4d, 0xc7, 0xee, 0x95, 0xbc, 0x63, 0x4a, 0x31, 0x18, 0x38, 0x11, 0x6a, 0x43, 0x9c, 0xb5, 0xce, 0xe7, 0x6d, 0x44, 0x3f, 0x16, 0xc9, 0xe0, 0x9b, 0xb2, 0xdb, 0xf2, 0x89, 0xa0, 0x7f, 0x56, 0x2d, 0x4, 0x8e, 0xa7, 0xdc, 0xf5, 0x2a, 0x3, 0x78, 0x51, 0x71, 0x58, 0x23, 0xa, 0xd5, 0xfc, 0x87, 0xae, 0x24, 0xd, 0x76, 0x5f, 0x80, 0xa9, 0xd2, 0xfb, 0x39, 0x10, 0x6b, 0x42, 0x9d, 0xb4, 0xcf, 0xe6, 0x6c, 0x45, 0x3e, 0x17, 0xc8, 0xe1, 0x9a, 0xb3, 0x93, 0xba, 0xc1, 0xe8, 0x37, 0x1e, 0x65, 0x4c, 0xc6, 0xef, 0x94, 0xbd, 0x62, 0x4b, 0x30, 0x19, 0x70, 0x59, 0x22, 0xb, 0xd4, 0xfd, 0x86, 0xaf, 0x25, 0xc, 0x77, 0x5e, 0x81, 0xa8, 0xd3, 0xfa, 0xda, 0xf3, 0x88, 0xa1, 0x7e, 0x57, 0x2c, 0x5, 0x8f, 0xa6, 0xdd, 0xf4, 0x2b, 0x2, 0x79, 0x50, 0xab, 0x82, 0xf9, 0xd0, 0xf, 0x26, 0x5d, 0x74, 0xfe, 0xd7, 0xac, 0x85, 0x5a, 0x73, 0x8, 0x21, 0x1, 0x28, 0x53, 0x7a, 0xa5, 0x8c, 0xf7, 0xde, 0x54, 0x7d, 0x6, 0x2f, 0xf0, 0xd9, 0xa2, 0x8b, 0xe2, 0xcb, 0xb0, 0x99, 0x46, 0x6f, 0x14, 0x3d, 0xb7, 0x9e, 0xe5, 0xcc, 0x13, 0x3a, 0x41, 0x68, 0x48, 0x61, 0x1a, 0x33, 0xec, 0xc5, 0xbe, 0x97, 0x1d, 0x34, 0x4f, 0x66, 0xb9, 0x90, 0xeb, 0xc2},
+ {0x0, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b, 0x9a, 0xb0, 0xce, 0xe4, 0x32, 0x18, 0x66, 0x4c, 0xd7, 0xfd, 0x83, 0xa9, 0x7f, 0x55, 0x2b, 0x1, 0x29, 0x3, 0x7d, 0x57, 0x81, 0xab, 0xd5, 0xff, 0x64, 0x4e, 0x30, 0x1a, 0xcc, 0xe6, 0x98, 0xb2, 0xb3, 0x99, 0xe7, 0xcd, 0x1b, 0x31, 0x4f, 0x65, 0xfe, 0xd4, 0xaa, 0x80, 0x56, 0x7c, 0x2, 0x28, 0x52, 0x78, 0x6, 0x2c, 0xfa, 0xd0, 0xae, 0x84, 0x1f, 0x35, 0x4b, 0x61, 0xb7, 0x9d, 0xe3, 0xc9, 0xc8, 0xe2, 0x9c, 0xb6, 0x60, 0x4a, 0x34, 0x1e, 0x85, 0xaf, 0xd1, 0xfb, 0x2d, 0x7, 0x79, 0x53, 0x7b, 0x51, 0x2f, 0x5, 0xd3, 0xf9, 0x87, 0xad, 0x36, 0x1c, 0x62, 0x48, 0x9e, 0xb4, 0xca, 0xe0, 0xe1, 0xcb, 0xb5, 0x9f, 0x49, 0x63, 0x1d, 0x37, 0xac, 0x86, 0xf8, 0xd2, 0x4, 0x2e, 0x50, 0x7a, 0xa4, 0x8e, 0xf0, 0xda, 0xc, 0x26, 0x58, 0x72, 0xe9, 0xc3, 0xbd, 0x97, 0x41, 0x6b, 0x15, 0x3f, 0x3e, 0x14, 0x6a, 0x40, 0x96, 0xbc, 0xc2, 0xe8, 0x73, 0x59, 0x27, 0xd, 0xdb, 0xf1, 0x8f, 0xa5, 0x8d, 0xa7, 0xd9, 0xf3, 0x25, 0xf, 0x71, 0x5b, 0xc0, 0xea, 0x94, 0xbe, 0x68, 0x42, 0x3c, 0x16, 0x17, 0x3d, 0x43, 0x69, 0xbf, 0x95, 0xeb, 0xc1, 0x5a, 0x70, 0xe, 0x24, 0xf2, 0xd8, 0xa6, 0x8c, 0xf6, 0xdc, 0xa2, 0x88, 0x5e, 0x74, 0xa, 0x20, 0xbb, 0x91, 0xef, 0xc5, 0x13, 0x39, 0x47, 0x6d, 0x6c, 0x46, 0x38, 0x12, 0xc4, 0xee, 0x90, 0xba, 0x21, 0xb, 0x75, 0x5f, 0x89, 0xa3, 0xdd, 0xf7, 0xdf, 0xf5, 0x8b, 0xa1, 0x77, 0x5d, 0x23, 0x9, 0x92, 0xb8, 0xc6, 0xec, 0x3a, 0x10, 0x6e, 0x44, 0x45, 0x6f, 0x11, 0x3b, 0xed, 0xc7, 0xb9, 0x93, 0x8, 0x22, 0x5c, 0x76, 0xa0, 0x8a, 0xf4, 0xde},
+ {0x0, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94, 0x8a, 0xa1, 0xdc, 0xf7, 0x26, 0xd, 0x70, 0x5b, 0xcf, 0xe4, 0x99, 0xb2, 0x63, 0x48, 0x35, 0x1e, 0x9, 0x22, 0x5f, 0x74, 0xa5, 0x8e, 0xf3, 0xd8, 0x4c, 0x67, 0x1a, 0x31, 0xe0, 0xcb, 0xb6, 0x9d, 0x83, 0xa8, 0xd5, 0xfe, 0x2f, 0x4, 0x79, 0x52, 0xc6, 0xed, 0x90, 0xbb, 0x6a, 0x41, 0x3c, 0x17, 0x12, 0x39, 0x44, 0x6f, 0xbe, 0x95, 0xe8, 0xc3, 0x57, 0x7c, 0x1, 0x2a, 0xfb, 0xd0, 0xad, 0x86, 0x98, 0xb3, 0xce, 0xe5, 0x34, 0x1f, 0x62, 0x49, 0xdd, 0xf6, 0x8b, 0xa0, 0x71, 0x5a, 0x27, 0xc, 0x1b, 0x30, 0x4d, 0x66, 0xb7, 0x9c, 0xe1, 0xca, 0x5e, 0x75, 0x8, 0x23, 0xf2, 0xd9, 0xa4, 0x8f, 0x91, 0xba, 0xc7, 0xec, 0x3d, 0x16, 0x6b, 0x40, 0xd4, 0xff, 0x82, 0xa9, 0x78, 0x53, 0x2e, 0x5, 0x24, 0xf, 0x72, 0x59, 0x88, 0xa3, 0xde, 0xf5, 0x61, 0x4a, 0x37, 0x1c, 0xcd, 0xe6, 0x9b, 0xb0, 0xae, 0x85, 0xf8, 0xd3, 0x2, 0x29, 0x54, 0x7f, 0xeb, 0xc0, 0xbd, 0x96, 0x47, 0x6c, 0x11, 0x3a, 0x2d, 0x6, 0x7b, 0x50, 0x81, 0xaa, 0xd7, 0xfc, 0x68, 0x43, 0x3e, 0x15, 0xc4, 0xef, 0x92, 0xb9, 0xa7, 0x8c, 0xf1, 0xda, 0xb, 0x20, 0x5d, 0x76, 0xe2, 0xc9, 0xb4, 0x9f, 0x4e, 0x65, 0x18, 0x33, 0x36, 0x1d, 0x60, 0x4b, 0x9a, 0xb1, 0xcc, 0xe7, 0x73, 0x58, 0x25, 0xe, 0xdf, 0xf4, 0x89, 0xa2, 0xbc, 0x97, 0xea, 0xc1, 0x10, 0x3b, 0x46, 0x6d, 0xf9, 0xd2, 0xaf, 0x84, 0x55, 0x7e, 0x3, 0x28, 0x3f, 0x14, 0x69, 0x42, 0x93, 0xb8, 0xc5, 0xee, 0x7a, 0x51, 0x2c, 0x7, 0xd6, 0xfd, 0x80, 0xab, 0xb5, 0x9e, 0xe3, 0xc8, 0x19, 0x32, 0x4f, 0x64, 0xf0, 0xdb, 0xa6, 0x8d, 0x5c, 0x77, 0xa, 0x21},
+ {0x0, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x9, 0xcd, 0xe1, 0x95, 0xb9, 0xfa, 0xd6, 0xa2, 0x8e, 0x4a, 0x66, 0x12, 0x3e, 0x87, 0xab, 0xdf, 0xf3, 0x37, 0x1b, 0x6f, 0x43, 0xe9, 0xc5, 0xb1, 0x9d, 0x59, 0x75, 0x1, 0x2d, 0x94, 0xb8, 0xcc, 0xe0, 0x24, 0x8, 0x7c, 0x50, 0x13, 0x3f, 0x4b, 0x67, 0xa3, 0x8f, 0xfb, 0xd7, 0x6e, 0x42, 0x36, 0x1a, 0xde, 0xf2, 0x86, 0xaa, 0xcf, 0xe3, 0x97, 0xbb, 0x7f, 0x53, 0x27, 0xb, 0xb2, 0x9e, 0xea, 0xc6, 0x2, 0x2e, 0x5a, 0x76, 0x35, 0x19, 0x6d, 0x41, 0x85, 0xa9, 0xdd, 0xf1, 0x48, 0x64, 0x10, 0x3c, 0xf8, 0xd4, 0xa0, 0x8c, 0x26, 0xa, 0x7e, 0x52, 0x96, 0xba, 0xce, 0xe2, 0x5b, 0x77, 0x3, 0x2f, 0xeb, 0xc7, 0xb3, 0x9f, 0xdc, 0xf0, 0x84, 0xa8, 0x6c, 0x40, 0x34, 0x18, 0xa1, 0x8d, 0xf9, 0xd5, 0x11, 0x3d, 0x49, 0x65, 0x83, 0xaf, 0xdb, 0xf7, 0x33, 0x1f, 0x6b, 0x47, 0xfe, 0xd2, 0xa6, 0x8a, 0x4e, 0x62, 0x16, 0x3a, 0x79, 0x55, 0x21, 0xd, 0xc9, 0xe5, 0x91, 0xbd, 0x4, 0x28, 0x5c, 0x70, 0xb4, 0x98, 0xec, 0xc0, 0x6a, 0x46, 0x32, 0x1e, 0xda, 0xf6, 0x82, 0xae, 0x17, 0x3b, 0x4f, 0x63, 0xa7, 0x8b, 0xff, 0xd3, 0x90, 0xbc, 0xc8, 0xe4, 0x20, 0xc, 0x78, 0x54, 0xed, 0xc1, 0xb5, 0x99, 0x5d, 0x71, 0x5, 0x29, 0x4c, 0x60, 0x14, 0x38, 0xfc, 0xd0, 0xa4, 0x88, 0x31, 0x1d, 0x69, 0x45, 0x81, 0xad, 0xd9, 0xf5, 0xb6, 0x9a, 0xee, 0xc2, 0x6, 0x2a, 0x5e, 0x72, 0xcb, 0xe7, 0x93, 0xbf, 0x7b, 0x57, 0x23, 0xf, 0xa5, 0x89, 0xfd, 0xd1, 0x15, 0x39, 0x4d, 0x61, 0xd8, 0xf4, 0x80, 0xac, 0x68, 0x44, 0x30, 0x1c, 0x5f, 0x73, 0x7, 0x2b, 0xef, 0xc3, 0xb7, 0x9b, 0x22, 0xe, 0x7a, 0x56, 0x92, 0xbe, 0xca, 0xe6},
+ {0x0, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x2, 0xc1, 0xec, 0x9b, 0xb6, 0xea, 0xc7, 0xb0, 0x9d, 0x5e, 0x73, 0x4, 0x29, 0x9f, 0xb2, 0xc5, 0xe8, 0x2b, 0x6, 0x71, 0x5c, 0xc9, 0xe4, 0x93, 0xbe, 0x7d, 0x50, 0x27, 0xa, 0xbc, 0x91, 0xe6, 0xcb, 0x8, 0x25, 0x52, 0x7f, 0x23, 0xe, 0x79, 0x54, 0x97, 0xba, 0xcd, 0xe0, 0x56, 0x7b, 0xc, 0x21, 0xe2, 0xcf, 0xb8, 0x95, 0x8f, 0xa2, 0xd5, 0xf8, 0x3b, 0x16, 0x61, 0x4c, 0xfa, 0xd7, 0xa0, 0x8d, 0x4e, 0x63, 0x14, 0x39, 0x65, 0x48, 0x3f, 0x12, 0xd1, 0xfc, 0x8b, 0xa6, 0x10, 0x3d, 0x4a, 0x67, 0xa4, 0x89, 0xfe, 0xd3, 0x46, 0x6b, 0x1c, 0x31, 0xf2, 0xdf, 0xa8, 0x85, 0x33, 0x1e, 0x69, 0x44, 0x87, 0xaa, 0xdd, 0xf0, 0xac, 0x81, 0xf6, 0xdb, 0x18, 0x35, 0x42, 0x6f, 0xd9, 0xf4, 0x83, 0xae, 0x6d, 0x40, 0x37, 0x1a, 0x3, 0x2e, 0x59, 0x74, 0xb7, 0x9a, 0xed, 0xc0, 0x76, 0x5b, 0x2c, 0x1, 0xc2, 0xef, 0x98, 0xb5, 0xe9, 0xc4, 0xb3, 0x9e, 0x5d, 0x70, 0x7, 0x2a, 0x9c, 0xb1, 0xc6, 0xeb, 0x28, 0x5, 0x72, 0x5f, 0xca, 0xe7, 0x90, 0xbd, 0x7e, 0x53, 0x24, 0x9, 0xbf, 0x92, 0xe5, 0xc8, 0xb, 0x26, 0x51, 0x7c, 0x20, 0xd, 0x7a, 0x57, 0x94, 0xb9, 0xce, 0xe3, 0x55, 0x78, 0xf, 0x22, 0xe1, 0xcc, 0xbb, 0x96, 0x8c, 0xa1, 0xd6, 0xfb, 0x38, 0x15, 0x62, 0x4f, 0xf9, 0xd4, 0xa3, 0x8e, 0x4d, 0x60, 0x17, 0x3a, 0x66, 0x4b, 0x3c, 0x11, 0xd2, 0xff, 0x88, 0xa5, 0x13, 0x3e, 0x49, 0x64, 0xa7, 0x8a, 0xfd, 0xd0, 0x45, 0x68, 0x1f, 0x32, 0xf1, 0xdc, 0xab, 0x86, 0x30, 0x1d, 0x6a, 0x47, 0x84, 0xa9, 0xde, 0xf3, 0xaf, 0x82, 0xf5, 0xd8, 0x1b, 0x36, 0x41, 0x6c, 0xda, 0xf7, 0x80, 0xad, 0x6e, 0x43, 0x34, 0x19},
+ {0x0, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7, 0xda, 0xf4, 0x86, 0xa8, 0x62, 0x4c, 0x3e, 0x10, 0xb7, 0x99, 0xeb, 0xc5, 0xf, 0x21, 0x53, 0x7d, 0xa9, 0x87, 0xf5, 0xdb, 0x11, 0x3f, 0x4d, 0x63, 0xc4, 0xea, 0x98, 0xb6, 0x7c, 0x52, 0x20, 0xe, 0x73, 0x5d, 0x2f, 0x1, 0xcb, 0xe5, 0x97, 0xb9, 0x1e, 0x30, 0x42, 0x6c, 0xa6, 0x88, 0xfa, 0xd4, 0x4f, 0x61, 0x13, 0x3d, 0xf7, 0xd9, 0xab, 0x85, 0x22, 0xc, 0x7e, 0x50, 0x9a, 0xb4, 0xc6, 0xe8, 0x95, 0xbb, 0xc9, 0xe7, 0x2d, 0x3, 0x71, 0x5f, 0xf8, 0xd6, 0xa4, 0x8a, 0x40, 0x6e, 0x1c, 0x32, 0xe6, 0xc8, 0xba, 0x94, 0x5e, 0x70, 0x2, 0x2c, 0x8b, 0xa5, 0xd7, 0xf9, 0x33, 0x1d, 0x6f, 0x41, 0x3c, 0x12, 0x60, 0x4e, 0x84, 0xaa, 0xd8, 0xf6, 0x51, 0x7f, 0xd, 0x23, 0xe9, 0xc7, 0xb5, 0x9b, 0x9e, 0xb0, 0xc2, 0xec, 0x26, 0x8, 0x7a, 0x54, 0xf3, 0xdd, 0xaf, 0x81, 0x4b, 0x65, 0x17, 0x39, 0x44, 0x6a, 0x18, 0x36, 0xfc, 0xd2, 0xa0, 0x8e, 0x29, 0x7, 0x75, 0x5b, 0x91, 0xbf, 0xcd, 0xe3, 0x37, 0x19, 0x6b, 0x45, 0x8f, 0xa1, 0xd3, 0xfd, 0x5a, 0x74, 0x6, 0x28, 0xe2, 0xcc, 0xbe, 0x90, 0xed, 0xc3, 0xb1, 0x9f, 0x55, 0x7b, 0x9, 0x27, 0x80, 0xae, 0xdc, 0xf2, 0x38, 0x16, 0x64, 0x4a, 0xd1, 0xff, 0x8d, 0xa3, 0x69, 0x47, 0x35, 0x1b, 0xbc, 0x92, 0xe0, 0xce, 0x4, 0x2a, 0x58, 0x76, 0xb, 0x25, 0x57, 0x79, 0xb3, 0x9d, 0xef, 0xc1, 0x66, 0x48, 0x3a, 0x14, 0xde, 0xf0, 0x82, 0xac, 0x78, 0x56, 0x24, 0xa, 0xc0, 0xee, 0x9c, 0xb2, 0x15, 0x3b, 0x49, 0x67, 0xad, 0x83, 0xf1, 0xdf, 0xa2, 0x8c, 0xfe, 0xd0, 0x1a, 0x34, 0x46, 0x68, 0xcf, 0xe1, 0x93, 0xbd, 0x77, 0x59, 0x2b, 0x5},
+ {0x0, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8, 0xca, 0xe5, 0x94, 0xbb, 0x76, 0x59, 0x28, 0x7, 0xaf, 0x80, 0xf1, 0xde, 0x13, 0x3c, 0x4d, 0x62, 0x89, 0xa6, 0xd7, 0xf8, 0x35, 0x1a, 0x6b, 0x44, 0xec, 0xc3, 0xb2, 0x9d, 0x50, 0x7f, 0xe, 0x21, 0x43, 0x6c, 0x1d, 0x32, 0xff, 0xd0, 0xa1, 0x8e, 0x26, 0x9, 0x78, 0x57, 0x9a, 0xb5, 0xc4, 0xeb, 0xf, 0x20, 0x51, 0x7e, 0xb3, 0x9c, 0xed, 0xc2, 0x6a, 0x45, 0x34, 0x1b, 0xd6, 0xf9, 0x88, 0xa7, 0xc5, 0xea, 0x9b, 0xb4, 0x79, 0x56, 0x27, 0x8, 0xa0, 0x8f, 0xfe, 0xd1, 0x1c, 0x33, 0x42, 0x6d, 0x86, 0xa9, 0xd8, 0xf7, 0x3a, 0x15, 0x64, 0x4b, 0xe3, 0xcc, 0xbd, 0x92, 0x5f, 0x70, 0x1, 0x2e, 0x4c, 0x63, 0x12, 0x3d, 0xf0, 0xdf, 0xae, 0x81, 0x29, 0x6, 0x77, 0x58, 0x95, 0xba, 0xcb, 0xe4, 0x1e, 0x31, 0x40, 0x6f, 0xa2, 0x8d, 0xfc, 0xd3, 0x7b, 0x54, 0x25, 0xa, 0xc7, 0xe8, 0x99, 0xb6, 0xd4, 0xfb, 0x8a, 0xa5, 0x68, 0x47, 0x36, 0x19, 0xb1, 0x9e, 0xef, 0xc0, 0xd, 0x22, 0x53, 0x7c, 0x97, 0xb8, 0xc9, 0xe6, 0x2b, 0x4, 0x75, 0x5a, 0xf2, 0xdd, 0xac, 0x83, 0x4e, 0x61, 0x10, 0x3f, 0x5d, 0x72, 0x3, 0x2c, 0xe1, 0xce, 0xbf, 0x90, 0x38, 0x17, 0x66, 0x49, 0x84, 0xab, 0xda, 0xf5, 0x11, 0x3e, 0x4f, 0x60, 0xad, 0x82, 0xf3, 0xdc, 0x74, 0x5b, 0x2a, 0x5, 0xc8, 0xe7, 0x96, 0xb9, 0xdb, 0xf4, 0x85, 0xaa, 0x67, 0x48, 0x39, 0x16, 0xbe, 0x91, 0xe0, 0xcf, 0x2, 0x2d, 0x5c, 0x73, 0x98, 0xb7, 0xc6, 0xe9, 0x24, 0xb, 0x7a, 0x55, 0xfd, 0xd2, 0xa3, 0x8c, 0x41, 0x6e, 0x1f, 0x30, 0x52, 0x7d, 0xc, 0x23, 0xee, 0xc1, 0xb0, 0x9f, 0x37, 0x18, 0x69, 0x46, 0x8b, 0xa4, 0xd5, 0xfa},
+ {0x0, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0xd, 0x27, 0x17, 0x47, 0x77, 0xe7, 0xd7, 0x87, 0xb7, 0xba, 0x8a, 0xda, 0xea, 0x7a, 0x4a, 0x1a, 0x2a, 0x4e, 0x7e, 0x2e, 0x1e, 0x8e, 0xbe, 0xee, 0xde, 0xd3, 0xe3, 0xb3, 0x83, 0x13, 0x23, 0x73, 0x43, 0x69, 0x59, 0x9, 0x39, 0xa9, 0x99, 0xc9, 0xf9, 0xf4, 0xc4, 0x94, 0xa4, 0x34, 0x4, 0x54, 0x64, 0x9c, 0xac, 0xfc, 0xcc, 0x5c, 0x6c, 0x3c, 0xc, 0x1, 0x31, 0x61, 0x51, 0xc1, 0xf1, 0xa1, 0x91, 0xbb, 0x8b, 0xdb, 0xeb, 0x7b, 0x4b, 0x1b, 0x2b, 0x26, 0x16, 0x46, 0x76, 0xe6, 0xd6, 0x86, 0xb6, 0xd2, 0xe2, 0xb2, 0x82, 0x12, 0x22, 0x72, 0x42, 0x4f, 0x7f, 0x2f, 0x1f, 0x8f, 0xbf, 0xef, 0xdf, 0xf5, 0xc5, 0x95, 0xa5, 0x35, 0x5, 0x55, 0x65, 0x68, 0x58, 0x8, 0x38, 0xa8, 0x98, 0xc8, 0xf8, 0x25, 0x15, 0x45, 0x75, 0xe5, 0xd5, 0x85, 0xb5, 0xb8, 0x88, 0xd8, 0xe8, 0x78, 0x48, 0x18, 0x28, 0x2, 0x32, 0x62, 0x52, 0xc2, 0xf2, 0xa2, 0x92, 0x9f, 0xaf, 0xff, 0xcf, 0x5f, 0x6f, 0x3f, 0xf, 0x6b, 0x5b, 0xb, 0x3b, 0xab, 0x9b, 0xcb, 0xfb, 0xf6, 0xc6, 0x96, 0xa6, 0x36, 0x6, 0x56, 0x66, 0x4c, 0x7c, 0x2c, 0x1c, 0x8c, 0xbc, 0xec, 0xdc, 0xd1, 0xe1, 0xb1, 0x81, 0x11, 0x21, 0x71, 0x41, 0xb9, 0x89, 0xd9, 0xe9, 0x79, 0x49, 0x19, 0x29, 0x24, 0x14, 0x44, 0x74, 0xe4, 0xd4, 0x84, 0xb4, 0x9e, 0xae, 0xfe, 0xce, 0x5e, 0x6e, 0x3e, 0xe, 0x3, 0x33, 0x63, 0x53, 0xc3, 0xf3, 0xa3, 0x93, 0xf7, 0xc7, 0x97, 0xa7, 0x37, 0x7, 0x57, 0x67, 0x6a, 0x5a, 0xa, 0x3a, 0xaa, 0x9a, 0xca, 0xfa, 0xd0, 0xe0, 0xb0, 0x80, 0x10, 0x20, 0x70, 0x40, 0x4d, 0x7d, 0x2d, 0x1d, 0x8d, 0xbd, 0xed, 0xdd},
+ {0x0, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x2, 0x37, 0x6, 0x55, 0x64, 0xf3, 0xc2, 0x91, 0xa0, 0xa2, 0x93, 0xc0, 0xf1, 0x66, 0x57, 0x4, 0x35, 0x6e, 0x5f, 0xc, 0x3d, 0xaa, 0x9b, 0xc8, 0xf9, 0xfb, 0xca, 0x99, 0xa8, 0x3f, 0xe, 0x5d, 0x6c, 0x59, 0x68, 0x3b, 0xa, 0x9d, 0xac, 0xff, 0xce, 0xcc, 0xfd, 0xae, 0x9f, 0x8, 0x39, 0x6a, 0x5b, 0xdc, 0xed, 0xbe, 0x8f, 0x18, 0x29, 0x7a, 0x4b, 0x49, 0x78, 0x2b, 0x1a, 0x8d, 0xbc, 0xef, 0xde, 0xeb, 0xda, 0x89, 0xb8, 0x2f, 0x1e, 0x4d, 0x7c, 0x7e, 0x4f, 0x1c, 0x2d, 0xba, 0x8b, 0xd8, 0xe9, 0xb2, 0x83, 0xd0, 0xe1, 0x76, 0x47, 0x14, 0x25, 0x27, 0x16, 0x45, 0x74, 0xe3, 0xd2, 0x81, 0xb0, 0x85, 0xb4, 0xe7, 0xd6, 0x41, 0x70, 0x23, 0x12, 0x10, 0x21, 0x72, 0x43, 0xd4, 0xe5, 0xb6, 0x87, 0xa5, 0x94, 0xc7, 0xf6, 0x61, 0x50, 0x3, 0x32, 0x30, 0x1, 0x52, 0x63, 0xf4, 0xc5, 0x96, 0xa7, 0x92, 0xa3, 0xf0, 0xc1, 0x56, 0x67, 0x34, 0x5, 0x7, 0x36, 0x65, 0x54, 0xc3, 0xf2, 0xa1, 0x90, 0xcb, 0xfa, 0xa9, 0x98, 0xf, 0x3e, 0x6d, 0x5c, 0x5e, 0x6f, 0x3c, 0xd, 0x9a, 0xab, 0xf8, 0xc9, 0xfc, 0xcd, 0x9e, 0xaf, 0x38, 0x9, 0x5a, 0x6b, 0x69, 0x58, 0xb, 0x3a, 0xad, 0x9c, 0xcf, 0xfe, 0x79, 0x48, 0x1b, 0x2a, 0xbd, 0x8c, 0xdf, 0xee, 0xec, 0xdd, 0x8e, 0xbf, 0x28, 0x19, 0x4a, 0x7b, 0x4e, 0x7f, 0x2c, 0x1d, 0x8a, 0xbb, 0xe8, 0xd9, 0xdb, 0xea, 0xb9, 0x88, 0x1f, 0x2e, 0x7d, 0x4c, 0x17, 0x26, 0x75, 0x44, 0xd3, 0xe2, 0xb1, 0x80, 0x82, 0xb3, 0xe0, 0xd1, 0x46, 0x77, 0x24, 0x15, 0x20, 0x11, 0x42, 0x73, 0xe4, 0xd5, 0x86, 0xb7, 0xb5, 0x84, 0xd7, 0xe6, 0x71, 0x40, 0x13, 0x22},
+ {0x0, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13, 0x7, 0x35, 0x63, 0x51, 0xcf, 0xfd, 0xab, 0x99, 0x8a, 0xb8, 0xee, 0xdc, 0x42, 0x70, 0x26, 0x14, 0xe, 0x3c, 0x6a, 0x58, 0xc6, 0xf4, 0xa2, 0x90, 0x83, 0xb1, 0xe7, 0xd5, 0x4b, 0x79, 0x2f, 0x1d, 0x9, 0x3b, 0x6d, 0x5f, 0xc1, 0xf3, 0xa5, 0x97, 0x84, 0xb6, 0xe0, 0xd2, 0x4c, 0x7e, 0x28, 0x1a, 0x1c, 0x2e, 0x78, 0x4a, 0xd4, 0xe6, 0xb0, 0x82, 0x91, 0xa3, 0xf5, 0xc7, 0x59, 0x6b, 0x3d, 0xf, 0x1b, 0x29, 0x7f, 0x4d, 0xd3, 0xe1, 0xb7, 0x85, 0x96, 0xa4, 0xf2, 0xc0, 0x5e, 0x6c, 0x3a, 0x8, 0x12, 0x20, 0x76, 0x44, 0xda, 0xe8, 0xbe, 0x8c, 0x9f, 0xad, 0xfb, 0xc9, 0x57, 0x65, 0x33, 0x1, 0x15, 0x27, 0x71, 0x43, 0xdd, 0xef, 0xb9, 0x8b, 0x98, 0xaa, 0xfc, 0xce, 0x50, 0x62, 0x34, 0x6, 0x38, 0xa, 0x5c, 0x6e, 0xf0, 0xc2, 0x94, 0xa6, 0xb5, 0x87, 0xd1, 0xe3, 0x7d, 0x4f, 0x19, 0x2b, 0x3f, 0xd, 0x5b, 0x69, 0xf7, 0xc5, 0x93, 0xa1, 0xb2, 0x80, 0xd6, 0xe4, 0x7a, 0x48, 0x1e, 0x2c, 0x36, 0x4, 0x52, 0x60, 0xfe, 0xcc, 0x9a, 0xa8, 0xbb, 0x89, 0xdf, 0xed, 0x73, 0x41, 0x17, 0x25, 0x31, 0x3, 0x55, 0x67, 0xf9, 0xcb, 0x9d, 0xaf, 0xbc, 0x8e, 0xd8, 0xea, 0x74, 0x46, 0x10, 0x22, 0x24, 0x16, 0x40, 0x72, 0xec, 0xde, 0x88, 0xba, 0xa9, 0x9b, 0xcd, 0xff, 0x61, 0x53, 0x5, 0x37, 0x23, 0x11, 0x47, 0x75, 0xeb, 0xd9, 0x8f, 0xbd, 0xae, 0x9c, 0xca, 0xf8, 0x66, 0x54, 0x2, 0x30, 0x2a, 0x18, 0x4e, 0x7c, 0xe2, 0xd0, 0x86, 0xb4, 0xa7, 0x95, 0xc3, 0xf1, 0x6f, 0x5d, 0xb, 0x39, 0x2d, 0x1f, 0x49, 0x7b, 0xe5, 0xd7, 0x81, 0xb3, 0xa0, 0x92, 0xc4, 0xf6, 0x68, 0x5a, 0xc, 0x3e},
+ {0x0, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c, 0x17, 0x24, 0x71, 0x42, 0xdb, 0xe8, 0xbd, 0x8e, 0x92, 0xa1, 0xf4, 0xc7, 0x5e, 0x6d, 0x38, 0xb, 0x2e, 0x1d, 0x48, 0x7b, 0xe2, 0xd1, 0x84, 0xb7, 0xab, 0x98, 0xcd, 0xfe, 0x67, 0x54, 0x1, 0x32, 0x39, 0xa, 0x5f, 0x6c, 0xf5, 0xc6, 0x93, 0xa0, 0xbc, 0x8f, 0xda, 0xe9, 0x70, 0x43, 0x16, 0x25, 0x5c, 0x6f, 0x3a, 0x9, 0x90, 0xa3, 0xf6, 0xc5, 0xd9, 0xea, 0xbf, 0x8c, 0x15, 0x26, 0x73, 0x40, 0x4b, 0x78, 0x2d, 0x1e, 0x87, 0xb4, 0xe1, 0xd2, 0xce, 0xfd, 0xa8, 0x9b, 0x2, 0x31, 0x64, 0x57, 0x72, 0x41, 0x14, 0x27, 0xbe, 0x8d, 0xd8, 0xeb, 0xf7, 0xc4, 0x91, 0xa2, 0x3b, 0x8, 0x5d, 0x6e, 0x65, 0x56, 0x3, 0x30, 0xa9, 0x9a, 0xcf, 0xfc, 0xe0, 0xd3, 0x86, 0xb5, 0x2c, 0x1f, 0x4a, 0x79, 0xb8, 0x8b, 0xde, 0xed, 0x74, 0x47, 0x12, 0x21, 0x3d, 0xe, 0x5b, 0x68, 0xf1, 0xc2, 0x97, 0xa4, 0xaf, 0x9c, 0xc9, 0xfa, 0x63, 0x50, 0x5, 0x36, 0x2a, 0x19, 0x4c, 0x7f, 0xe6, 0xd5, 0x80, 0xb3, 0x96, 0xa5, 0xf0, 0xc3, 0x5a, 0x69, 0x3c, 0xf, 0x13, 0x20, 0x75, 0x46, 0xdf, 0xec, 0xb9, 0x8a, 0x81, 0xb2, 0xe7, 0xd4, 0x4d, 0x7e, 0x2b, 0x18, 0x4, 0x37, 0x62, 0x51, 0xc8, 0xfb, 0xae, 0x9d, 0xe4, 0xd7, 0x82, 0xb1, 0x28, 0x1b, 0x4e, 0x7d, 0x61, 0x52, 0x7, 0x34, 0xad, 0x9e, 0xcb, 0xf8, 0xf3, 0xc0, 0x95, 0xa6, 0x3f, 0xc, 0x59, 0x6a, 0x76, 0x45, 0x10, 0x23, 0xba, 0x89, 0xdc, 0xef, 0xca, 0xf9, 0xac, 0x9f, 0x6, 0x35, 0x60, 0x53, 0x4f, 0x7c, 0x29, 0x1a, 0x83, 0xb0, 0xe5, 0xd6, 0xdd, 0xee, 0xbb, 0x88, 0x11, 0x22, 0x77, 0x44, 0x58, 0x6b, 0x3e, 0xd, 0x94, 0xa7, 0xf2, 0xc1},
+ {0x0, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x5, 0x31, 0x67, 0x53, 0xf, 0x3b, 0xb7, 0x83, 0xdf, 0xeb, 0xda, 0xee, 0xb2, 0x86, 0xa, 0x3e, 0x62, 0x56, 0xce, 0xfa, 0xa6, 0x92, 0x1e, 0x2a, 0x76, 0x42, 0x73, 0x47, 0x1b, 0x2f, 0xa3, 0x97, 0xcb, 0xff, 0xa9, 0x9d, 0xc1, 0xf5, 0x79, 0x4d, 0x11, 0x25, 0x14, 0x20, 0x7c, 0x48, 0xc4, 0xf0, 0xac, 0x98, 0x81, 0xb5, 0xe9, 0xdd, 0x51, 0x65, 0x39, 0xd, 0x3c, 0x8, 0x54, 0x60, 0xec, 0xd8, 0x84, 0xb0, 0xe6, 0xd2, 0x8e, 0xba, 0x36, 0x2, 0x5e, 0x6a, 0x5b, 0x6f, 0x33, 0x7, 0x8b, 0xbf, 0xe3, 0xd7, 0x4f, 0x7b, 0x27, 0x13, 0x9f, 0xab, 0xf7, 0xc3, 0xf2, 0xc6, 0x9a, 0xae, 0x22, 0x16, 0x4a, 0x7e, 0x28, 0x1c, 0x40, 0x74, 0xf8, 0xcc, 0x90, 0xa4, 0x95, 0xa1, 0xfd, 0xc9, 0x45, 0x71, 0x2d, 0x19, 0x1f, 0x2b, 0x77, 0x43, 0xcf, 0xfb, 0xa7, 0x93, 0xa2, 0x96, 0xca, 0xfe, 0x72, 0x46, 0x1a, 0x2e, 0x78, 0x4c, 0x10, 0x24, 0xa8, 0x9c, 0xc0, 0xf4, 0xc5, 0xf1, 0xad, 0x99, 0x15, 0x21, 0x7d, 0x49, 0xd1, 0xe5, 0xb9, 0x8d, 0x1, 0x35, 0x69, 0x5d, 0x6c, 0x58, 0x4, 0x30, 0xbc, 0x88, 0xd4, 0xe0, 0xb6, 0x82, 0xde, 0xea, 0x66, 0x52, 0xe, 0x3a, 0xb, 0x3f, 0x63, 0x57, 0xdb, 0xef, 0xb3, 0x87, 0x9e, 0xaa, 0xf6, 0xc2, 0x4e, 0x7a, 0x26, 0x12, 0x23, 0x17, 0x4b, 0x7f, 0xf3, 0xc7, 0x9b, 0xaf, 0xf9, 0xcd, 0x91, 0xa5, 0x29, 0x1d, 0x41, 0x75, 0x44, 0x70, 0x2c, 0x18, 0x94, 0xa0, 0xfc, 0xc8, 0x50, 0x64, 0x38, 0xc, 0x80, 0xb4, 0xe8, 0xdc, 0xed, 0xd9, 0x85, 0xb1, 0x3d, 0x9, 0x55, 0x61, 0x37, 0x3, 0x5f, 0x6b, 0xe7, 0xd3, 0x8f, 0xbb, 0x8a, 0xbe, 0xe2, 0xd6, 0x5a, 0x6e, 0x32, 0x6},
+ {0x0, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0xb, 0x3e, 0x77, 0x42, 0x1d, 0x28, 0xa3, 0x96, 0xc9, 0xfc, 0xc2, 0xf7, 0xa8, 0x9d, 0x16, 0x23, 0x7c, 0x49, 0xee, 0xdb, 0x84, 0xb1, 0x3a, 0xf, 0x50, 0x65, 0x5b, 0x6e, 0x31, 0x4, 0x8f, 0xba, 0xe5, 0xd0, 0x99, 0xac, 0xf3, 0xc6, 0x4d, 0x78, 0x27, 0x12, 0x2c, 0x19, 0x46, 0x73, 0xf8, 0xcd, 0x92, 0xa7, 0xc1, 0xf4, 0xab, 0x9e, 0x15, 0x20, 0x7f, 0x4a, 0x74, 0x41, 0x1e, 0x2b, 0xa0, 0x95, 0xca, 0xff, 0xb6, 0x83, 0xdc, 0xe9, 0x62, 0x57, 0x8, 0x3d, 0x3, 0x36, 0x69, 0x5c, 0xd7, 0xe2, 0xbd, 0x88, 0x2f, 0x1a, 0x45, 0x70, 0xfb, 0xce, 0x91, 0xa4, 0x9a, 0xaf, 0xf0, 0xc5, 0x4e, 0x7b, 0x24, 0x11, 0x58, 0x6d, 0x32, 0x7, 0x8c, 0xb9, 0xe6, 0xd3, 0xed, 0xd8, 0x87, 0xb2, 0x39, 0xc, 0x53, 0x66, 0x9f, 0xaa, 0xf5, 0xc0, 0x4b, 0x7e, 0x21, 0x14, 0x2a, 0x1f, 0x40, 0x75, 0xfe, 0xcb, 0x94, 0xa1, 0xe8, 0xdd, 0x82, 0xb7, 0x3c, 0x9, 0x56, 0x63, 0x5d, 0x68, 0x37, 0x2, 0x89, 0xbc, 0xe3, 0xd6, 0x71, 0x44, 0x1b, 0x2e, 0xa5, 0x90, 0xcf, 0xfa, 0xc4, 0xf1, 0xae, 0x9b, 0x10, 0x25, 0x7a, 0x4f, 0x6, 0x33, 0x6c, 0x59, 0xd2, 0xe7, 0xb8, 0x8d, 0xb3, 0x86, 0xd9, 0xec, 0x67, 0x52, 0xd, 0x38, 0x5e, 0x6b, 0x34, 0x1, 0x8a, 0xbf, 0xe0, 0xd5, 0xeb, 0xde, 0x81, 0xb4, 0x3f, 0xa, 0x55, 0x60, 0x29, 0x1c, 0x43, 0x76, 0xfd, 0xc8, 0x97, 0xa2, 0x9c, 0xa9, 0xf6, 0xc3, 0x48, 0x7d, 0x22, 0x17, 0xb0, 0x85, 0xda, 0xef, 0x64, 0x51, 0xe, 0x3b, 0x5, 0x30, 0x6f, 0x5a, 0xd1, 0xe4, 0xbb, 0x8e, 0xc7, 0xf2, 0xad, 0x98, 0x13, 0x26, 0x79, 0x4c, 0x72, 0x47, 0x18, 0x2d, 0xa6, 0x93, 0xcc, 0xf9},
+ {0x0, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f, 0x47, 0x71, 0x2b, 0x1d, 0x9f, 0xa9, 0xf3, 0xc5, 0xea, 0xdc, 0x86, 0xb0, 0x32, 0x4, 0x5e, 0x68, 0x8e, 0xb8, 0xe2, 0xd4, 0x56, 0x60, 0x3a, 0xc, 0x23, 0x15, 0x4f, 0x79, 0xfb, 0xcd, 0x97, 0xa1, 0xc9, 0xff, 0xa5, 0x93, 0x11, 0x27, 0x7d, 0x4b, 0x64, 0x52, 0x8, 0x3e, 0xbc, 0x8a, 0xd0, 0xe6, 0x1, 0x37, 0x6d, 0x5b, 0xd9, 0xef, 0xb5, 0x83, 0xac, 0x9a, 0xc0, 0xf6, 0x74, 0x42, 0x18, 0x2e, 0x46, 0x70, 0x2a, 0x1c, 0x9e, 0xa8, 0xf2, 0xc4, 0xeb, 0xdd, 0x87, 0xb1, 0x33, 0x5, 0x5f, 0x69, 0x8f, 0xb9, 0xe3, 0xd5, 0x57, 0x61, 0x3b, 0xd, 0x22, 0x14, 0x4e, 0x78, 0xfa, 0xcc, 0x96, 0xa0, 0xc8, 0xfe, 0xa4, 0x92, 0x10, 0x26, 0x7c, 0x4a, 0x65, 0x53, 0x9, 0x3f, 0xbd, 0x8b, 0xd1, 0xe7, 0x2, 0x34, 0x6e, 0x58, 0xda, 0xec, 0xb6, 0x80, 0xaf, 0x99, 0xc3, 0xf5, 0x77, 0x41, 0x1b, 0x2d, 0x45, 0x73, 0x29, 0x1f, 0x9d, 0xab, 0xf1, 0xc7, 0xe8, 0xde, 0x84, 0xb2, 0x30, 0x6, 0x5c, 0x6a, 0x8c, 0xba, 0xe0, 0xd6, 0x54, 0x62, 0x38, 0xe, 0x21, 0x17, 0x4d, 0x7b, 0xf9, 0xcf, 0x95, 0xa3, 0xcb, 0xfd, 0xa7, 0x91, 0x13, 0x25, 0x7f, 0x49, 0x66, 0x50, 0xa, 0x3c, 0xbe, 0x88, 0xd2, 0xe4, 0x3, 0x35, 0x6f, 0x59, 0xdb, 0xed, 0xb7, 0x81, 0xae, 0x98, 0xc2, 0xf4, 0x76, 0x40, 0x1a, 0x2c, 0x44, 0x72, 0x28, 0x1e, 0x9c, 0xaa, 0xf0, 0xc6, 0xe9, 0xdf, 0x85, 0xb3, 0x31, 0x7, 0x5d, 0x6b, 0x8d, 0xbb, 0xe1, 0xd7, 0x55, 0x63, 0x39, 0xf, 0x20, 0x16, 0x4c, 0x7a, 0xf8, 0xce, 0x94, 0xa2, 0xca, 0xfc, 0xa6, 0x90, 0x12, 0x24, 0x7e, 0x48, 0x67, 0x51, 0xb, 0x3d, 0xbf, 0x89, 0xd3, 0xe5},
+ {0x0, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20, 0x57, 0x60, 0x39, 0xe, 0x8b, 0xbc, 0xe5, 0xd2, 0xf2, 0xc5, 0x9c, 0xab, 0x2e, 0x19, 0x40, 0x77, 0xae, 0x99, 0xc0, 0xf7, 0x72, 0x45, 0x1c, 0x2b, 0xb, 0x3c, 0x65, 0x52, 0xd7, 0xe0, 0xb9, 0x8e, 0xf9, 0xce, 0x97, 0xa0, 0x25, 0x12, 0x4b, 0x7c, 0x5c, 0x6b, 0x32, 0x5, 0x80, 0xb7, 0xee, 0xd9, 0x41, 0x76, 0x2f, 0x18, 0x9d, 0xaa, 0xf3, 0xc4, 0xe4, 0xd3, 0x8a, 0xbd, 0x38, 0xf, 0x56, 0x61, 0x16, 0x21, 0x78, 0x4f, 0xca, 0xfd, 0xa4, 0x93, 0xb3, 0x84, 0xdd, 0xea, 0x6f, 0x58, 0x1, 0x36, 0xef, 0xd8, 0x81, 0xb6, 0x33, 0x4, 0x5d, 0x6a, 0x4a, 0x7d, 0x24, 0x13, 0x96, 0xa1, 0xf8, 0xcf, 0xb8, 0x8f, 0xd6, 0xe1, 0x64, 0x53, 0xa, 0x3d, 0x1d, 0x2a, 0x73, 0x44, 0xc1, 0xf6, 0xaf, 0x98, 0x82, 0xb5, 0xec, 0xdb, 0x5e, 0x69, 0x30, 0x7, 0x27, 0x10, 0x49, 0x7e, 0xfb, 0xcc, 0x95, 0xa2, 0xd5, 0xe2, 0xbb, 0x8c, 0x9, 0x3e, 0x67, 0x50, 0x70, 0x47, 0x1e, 0x29, 0xac, 0x9b, 0xc2, 0xf5, 0x2c, 0x1b, 0x42, 0x75, 0xf0, 0xc7, 0x9e, 0xa9, 0x89, 0xbe, 0xe7, 0xd0, 0x55, 0x62, 0x3b, 0xc, 0x7b, 0x4c, 0x15, 0x22, 0xa7, 0x90, 0xc9, 0xfe, 0xde, 0xe9, 0xb0, 0x87, 0x2, 0x35, 0x6c, 0x5b, 0xc3, 0xf4, 0xad, 0x9a, 0x1f, 0x28, 0x71, 0x46, 0x66, 0x51, 0x8, 0x3f, 0xba, 0x8d, 0xd4, 0xe3, 0x94, 0xa3, 0xfa, 0xcd, 0x48, 0x7f, 0x26, 0x11, 0x31, 0x6, 0x5f, 0x68, 0xed, 0xda, 0x83, 0xb4, 0x6d, 0x5a, 0x3, 0x34, 0xb1, 0x86, 0xdf, 0xe8, 0xc8, 0xff, 0xa6, 0x91, 0x14, 0x23, 0x7a, 0x4d, 0x3a, 0xd, 0x54, 0x63, 0xe6, 0xd1, 0x88, 0xbf, 0x9f, 0xa8, 0xf1, 0xc6, 0x43, 0x74, 0x2d, 0x1a},
+ {0x0, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x5, 0x4d, 0x75, 0xa7, 0x9f, 0xd7, 0xef, 0x47, 0x7f, 0x37, 0xf, 0x7a, 0x42, 0xa, 0x32, 0x9a, 0xa2, 0xea, 0xd2, 0x53, 0x6b, 0x23, 0x1b, 0xb3, 0x8b, 0xc3, 0xfb, 0x8e, 0xb6, 0xfe, 0xc6, 0x6e, 0x56, 0x1e, 0x26, 0xf4, 0xcc, 0x84, 0xbc, 0x14, 0x2c, 0x64, 0x5c, 0x29, 0x11, 0x59, 0x61, 0xc9, 0xf1, 0xb9, 0x81, 0xa6, 0x9e, 0xd6, 0xee, 0x46, 0x7e, 0x36, 0xe, 0x7b, 0x43, 0xb, 0x33, 0x9b, 0xa3, 0xeb, 0xd3, 0x1, 0x39, 0x71, 0x49, 0xe1, 0xd9, 0x91, 0xa9, 0xdc, 0xe4, 0xac, 0x94, 0x3c, 0x4, 0x4c, 0x74, 0xf5, 0xcd, 0x85, 0xbd, 0x15, 0x2d, 0x65, 0x5d, 0x28, 0x10, 0x58, 0x60, 0xc8, 0xf0, 0xb8, 0x80, 0x52, 0x6a, 0x22, 0x1a, 0xb2, 0x8a, 0xc2, 0xfa, 0x8f, 0xb7, 0xff, 0xc7, 0x6f, 0x57, 0x1f, 0x27, 0x51, 0x69, 0x21, 0x19, 0xb1, 0x89, 0xc1, 0xf9, 0x8c, 0xb4, 0xfc, 0xc4, 0x6c, 0x54, 0x1c, 0x24, 0xf6, 0xce, 0x86, 0xbe, 0x16, 0x2e, 0x66, 0x5e, 0x2b, 0x13, 0x5b, 0x63, 0xcb, 0xf3, 0xbb, 0x83, 0x2, 0x3a, 0x72, 0x4a, 0xe2, 0xda, 0x92, 0xaa, 0xdf, 0xe7, 0xaf, 0x97, 0x3f, 0x7, 0x4f, 0x77, 0xa5, 0x9d, 0xd5, 0xed, 0x45, 0x7d, 0x35, 0xd, 0x78, 0x40, 0x8, 0x30, 0x98, 0xa0, 0xe8, 0xd0, 0xf7, 0xcf, 0x87, 0xbf, 0x17, 0x2f, 0x67, 0x5f, 0x2a, 0x12, 0x5a, 0x62, 0xca, 0xf2, 0xba, 0x82, 0x50, 0x68, 0x20, 0x18, 0xb0, 0x88, 0xc0, 0xf8, 0x8d, 0xb5, 0xfd, 0xc5, 0x6d, 0x55, 0x1d, 0x25, 0xa4, 0x9c, 0xd4, 0xec, 0x44, 0x7c, 0x34, 0xc, 0x79, 0x41, 0x9, 0x31, 0x99, 0xa1, 0xe9, 0xd1, 0x3, 0x3b, 0x73, 0x4b, 0xe3, 0xdb, 0x93, 0xab, 0xde, 0xe6, 0xae, 0x96, 0x3e, 0x6, 0x4e, 0x76},
+ {0x0, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x8, 0x43, 0x7a, 0xb7, 0x8e, 0xc5, 0xfc, 0x53, 0x6a, 0x21, 0x18, 0x62, 0x5b, 0x10, 0x29, 0x86, 0xbf, 0xf4, 0xcd, 0x73, 0x4a, 0x1, 0x38, 0x97, 0xae, 0xe5, 0xdc, 0xa6, 0x9f, 0xd4, 0xed, 0x42, 0x7b, 0x30, 0x9, 0xc4, 0xfd, 0xb6, 0x8f, 0x20, 0x19, 0x52, 0x6b, 0x11, 0x28, 0x63, 0x5a, 0xf5, 0xcc, 0x87, 0xbe, 0xe6, 0xdf, 0x94, 0xad, 0x2, 0x3b, 0x70, 0x49, 0x33, 0xa, 0x41, 0x78, 0xd7, 0xee, 0xa5, 0x9c, 0x51, 0x68, 0x23, 0x1a, 0xb5, 0x8c, 0xc7, 0xfe, 0x84, 0xbd, 0xf6, 0xcf, 0x60, 0x59, 0x12, 0x2b, 0x95, 0xac, 0xe7, 0xde, 0x71, 0x48, 0x3, 0x3a, 0x40, 0x79, 0x32, 0xb, 0xa4, 0x9d, 0xd6, 0xef, 0x22, 0x1b, 0x50, 0x69, 0xc6, 0xff, 0xb4, 0x8d, 0xf7, 0xce, 0x85, 0xbc, 0x13, 0x2a, 0x61, 0x58, 0xd1, 0xe8, 0xa3, 0x9a, 0x35, 0xc, 0x47, 0x7e, 0x4, 0x3d, 0x76, 0x4f, 0xe0, 0xd9, 0x92, 0xab, 0x66, 0x5f, 0x14, 0x2d, 0x82, 0xbb, 0xf0, 0xc9, 0xb3, 0x8a, 0xc1, 0xf8, 0x57, 0x6e, 0x25, 0x1c, 0xa2, 0x9b, 0xd0, 0xe9, 0x46, 0x7f, 0x34, 0xd, 0x77, 0x4e, 0x5, 0x3c, 0x93, 0xaa, 0xe1, 0xd8, 0x15, 0x2c, 0x67, 0x5e, 0xf1, 0xc8, 0x83, 0xba, 0xc0, 0xf9, 0xb2, 0x8b, 0x24, 0x1d, 0x56, 0x6f, 0x37, 0xe, 0x45, 0x7c, 0xd3, 0xea, 0xa1, 0x98, 0xe2, 0xdb, 0x90, 0xa9, 0x6, 0x3f, 0x74, 0x4d, 0x80, 0xb9, 0xf2, 0xcb, 0x64, 0x5d, 0x16, 0x2f, 0x55, 0x6c, 0x27, 0x1e, 0xb1, 0x88, 0xc3, 0xfa, 0x44, 0x7d, 0x36, 0xf, 0xa0, 0x99, 0xd2, 0xeb, 0x91, 0xa8, 0xe3, 0xda, 0x75, 0x4c, 0x7, 0x3e, 0xf3, 0xca, 0x81, 0xb8, 0x17, 0x2e, 0x65, 0x5c, 0x26, 0x1f, 0x54, 0x6d, 0xc2, 0xfb, 0xb0, 0x89},
+ {0x0, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b, 0x87, 0xbd, 0xf3, 0xc9, 0x6f, 0x55, 0x1b, 0x21, 0x4a, 0x70, 0x3e, 0x4, 0xa2, 0x98, 0xd6, 0xec, 0x13, 0x29, 0x67, 0x5d, 0xfb, 0xc1, 0x8f, 0xb5, 0xde, 0xe4, 0xaa, 0x90, 0x36, 0xc, 0x42, 0x78, 0x94, 0xae, 0xe0, 0xda, 0x7c, 0x46, 0x8, 0x32, 0x59, 0x63, 0x2d, 0x17, 0xb1, 0x8b, 0xc5, 0xff, 0x26, 0x1c, 0x52, 0x68, 0xce, 0xf4, 0xba, 0x80, 0xeb, 0xd1, 0x9f, 0xa5, 0x3, 0x39, 0x77, 0x4d, 0xa1, 0x9b, 0xd5, 0xef, 0x49, 0x73, 0x3d, 0x7, 0x6c, 0x56, 0x18, 0x22, 0x84, 0xbe, 0xf0, 0xca, 0x35, 0xf, 0x41, 0x7b, 0xdd, 0xe7, 0xa9, 0x93, 0xf8, 0xc2, 0x8c, 0xb6, 0x10, 0x2a, 0x64, 0x5e, 0xb2, 0x88, 0xc6, 0xfc, 0x5a, 0x60, 0x2e, 0x14, 0x7f, 0x45, 0xb, 0x31, 0x97, 0xad, 0xe3, 0xd9, 0x4c, 0x76, 0x38, 0x2, 0xa4, 0x9e, 0xd0, 0xea, 0x81, 0xbb, 0xf5, 0xcf, 0x69, 0x53, 0x1d, 0x27, 0xcb, 0xf1, 0xbf, 0x85, 0x23, 0x19, 0x57, 0x6d, 0x6, 0x3c, 0x72, 0x48, 0xee, 0xd4, 0x9a, 0xa0, 0x5f, 0x65, 0x2b, 0x11, 0xb7, 0x8d, 0xc3, 0xf9, 0x92, 0xa8, 0xe6, 0xdc, 0x7a, 0x40, 0xe, 0x34, 0xd8, 0xe2, 0xac, 0x96, 0x30, 0xa, 0x44, 0x7e, 0x15, 0x2f, 0x61, 0x5b, 0xfd, 0xc7, 0x89, 0xb3, 0x6a, 0x50, 0x1e, 0x24, 0x82, 0xb8, 0xf6, 0xcc, 0xa7, 0x9d, 0xd3, 0xe9, 0x4f, 0x75, 0x3b, 0x1, 0xed, 0xd7, 0x99, 0xa3, 0x5, 0x3f, 0x71, 0x4b, 0x20, 0x1a, 0x54, 0x6e, 0xc8, 0xf2, 0xbc, 0x86, 0x79, 0x43, 0xd, 0x37, 0x91, 0xab, 0xe5, 0xdf, 0xb4, 0x8e, 0xc0, 0xfa, 0x5c, 0x66, 0x28, 0x12, 0xfe, 0xc4, 0x8a, 0xb0, 0x16, 0x2c, 0x62, 0x58, 0x33, 0x9, 0x47, 0x7d, 0xdb, 0xe1, 0xaf, 0x95},
+ {0x0, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64, 0x97, 0xac, 0xe1, 0xda, 0x7b, 0x40, 0xd, 0x36, 0x52, 0x69, 0x24, 0x1f, 0xbe, 0x85, 0xc8, 0xf3, 0x33, 0x8, 0x45, 0x7e, 0xdf, 0xe4, 0xa9, 0x92, 0xf6, 0xcd, 0x80, 0xbb, 0x1a, 0x21, 0x6c, 0x57, 0xa4, 0x9f, 0xd2, 0xe9, 0x48, 0x73, 0x3e, 0x5, 0x61, 0x5a, 0x17, 0x2c, 0x8d, 0xb6, 0xfb, 0xc0, 0x66, 0x5d, 0x10, 0x2b, 0x8a, 0xb1, 0xfc, 0xc7, 0xa3, 0x98, 0xd5, 0xee, 0x4f, 0x74, 0x39, 0x2, 0xf1, 0xca, 0x87, 0xbc, 0x1d, 0x26, 0x6b, 0x50, 0x34, 0xf, 0x42, 0x79, 0xd8, 0xe3, 0xae, 0x95, 0x55, 0x6e, 0x23, 0x18, 0xb9, 0x82, 0xcf, 0xf4, 0x90, 0xab, 0xe6, 0xdd, 0x7c, 0x47, 0xa, 0x31, 0xc2, 0xf9, 0xb4, 0x8f, 0x2e, 0x15, 0x58, 0x63, 0x7, 0x3c, 0x71, 0x4a, 0xeb, 0xd0, 0x9d, 0xa6, 0xcc, 0xf7, 0xba, 0x81, 0x20, 0x1b, 0x56, 0x6d, 0x9, 0x32, 0x7f, 0x44, 0xe5, 0xde, 0x93, 0xa8, 0x5b, 0x60, 0x2d, 0x16, 0xb7, 0x8c, 0xc1, 0xfa, 0x9e, 0xa5, 0xe8, 0xd3, 0x72, 0x49, 0x4, 0x3f, 0xff, 0xc4, 0x89, 0xb2, 0x13, 0x28, 0x65, 0x5e, 0x3a, 0x1, 0x4c, 0x77, 0xd6, 0xed, 0xa0, 0x9b, 0x68, 0x53, 0x1e, 0x25, 0x84, 0xbf, 0xf2, 0xc9, 0xad, 0x96, 0xdb, 0xe0, 0x41, 0x7a, 0x37, 0xc, 0xaa, 0x91, 0xdc, 0xe7, 0x46, 0x7d, 0x30, 0xb, 0x6f, 0x54, 0x19, 0x22, 0x83, 0xb8, 0xf5, 0xce, 0x3d, 0x6, 0x4b, 0x70, 0xd1, 0xea, 0xa7, 0x9c, 0xf8, 0xc3, 0x8e, 0xb5, 0x14, 0x2f, 0x62, 0x59, 0x99, 0xa2, 0xef, 0xd4, 0x75, 0x4e, 0x3, 0x38, 0x5c, 0x67, 0x2a, 0x11, 0xb0, 0x8b, 0xc6, 0xfd, 0xe, 0x35, 0x78, 0x43, 0xe2, 0xd9, 0x94, 0xaf, 0xcb, 0xf0, 0xbd, 0x86, 0x27, 0x1c, 0x51, 0x6a},
+ {0x0, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0xd, 0x31, 0x75, 0x49, 0xe7, 0xdb, 0x9f, 0xa3, 0x17, 0x2b, 0x6f, 0x53, 0x1a, 0x26, 0x62, 0x5e, 0xea, 0xd6, 0x92, 0xae, 0xd3, 0xef, 0xab, 0x97, 0x23, 0x1f, 0x5b, 0x67, 0x2e, 0x12, 0x56, 0x6a, 0xde, 0xe2, 0xa6, 0x9a, 0x34, 0x8, 0x4c, 0x70, 0xc4, 0xf8, 0xbc, 0x80, 0xc9, 0xf5, 0xb1, 0x8d, 0x39, 0x5, 0x41, 0x7d, 0xbb, 0x87, 0xc3, 0xff, 0x4b, 0x77, 0x33, 0xf, 0x46, 0x7a, 0x3e, 0x2, 0xb6, 0x8a, 0xce, 0xf2, 0x5c, 0x60, 0x24, 0x18, 0xac, 0x90, 0xd4, 0xe8, 0xa1, 0x9d, 0xd9, 0xe5, 0x51, 0x6d, 0x29, 0x15, 0x68, 0x54, 0x10, 0x2c, 0x98, 0xa4, 0xe0, 0xdc, 0x95, 0xa9, 0xed, 0xd1, 0x65, 0x59, 0x1d, 0x21, 0x8f, 0xb3, 0xf7, 0xcb, 0x7f, 0x43, 0x7, 0x3b, 0x72, 0x4e, 0xa, 0x36, 0x82, 0xbe, 0xfa, 0xc6, 0x6b, 0x57, 0x13, 0x2f, 0x9b, 0xa7, 0xe3, 0xdf, 0x96, 0xaa, 0xee, 0xd2, 0x66, 0x5a, 0x1e, 0x22, 0x8c, 0xb0, 0xf4, 0xc8, 0x7c, 0x40, 0x4, 0x38, 0x71, 0x4d, 0x9, 0x35, 0x81, 0xbd, 0xf9, 0xc5, 0xb8, 0x84, 0xc0, 0xfc, 0x48, 0x74, 0x30, 0xc, 0x45, 0x79, 0x3d, 0x1, 0xb5, 0x89, 0xcd, 0xf1, 0x5f, 0x63, 0x27, 0x1b, 0xaf, 0x93, 0xd7, 0xeb, 0xa2, 0x9e, 0xda, 0xe6, 0x52, 0x6e, 0x2a, 0x16, 0xd0, 0xec, 0xa8, 0x94, 0x20, 0x1c, 0x58, 0x64, 0x2d, 0x11, 0x55, 0x69, 0xdd, 0xe1, 0xa5, 0x99, 0x37, 0xb, 0x4f, 0x73, 0xc7, 0xfb, 0xbf, 0x83, 0xca, 0xf6, 0xb2, 0x8e, 0x3a, 0x6, 0x42, 0x7e, 0x3, 0x3f, 0x7b, 0x47, 0xf3, 0xcf, 0x8b, 0xb7, 0xfe, 0xc2, 0x86, 0xba, 0xe, 0x32, 0x76, 0x4a, 0xe4, 0xd8, 0x9c, 0xa0, 0x14, 0x28, 0x6c, 0x50, 0x19, 0x25, 0x61, 0x5d, 0xe9, 0xd5, 0x91, 0xad},
+ {0x0, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x1, 0x3c, 0x7b, 0x46, 0xf7, 0xca, 0x8d, 0xb0, 0x3, 0x3e, 0x79, 0x44, 0x2, 0x3f, 0x78, 0x45, 0xf6, 0xcb, 0x8c, 0xb1, 0xf3, 0xce, 0x89, 0xb4, 0x7, 0x3a, 0x7d, 0x40, 0x6, 0x3b, 0x7c, 0x41, 0xf2, 0xcf, 0x88, 0xb5, 0x4, 0x39, 0x7e, 0x43, 0xf0, 0xcd, 0x8a, 0xb7, 0xf1, 0xcc, 0x8b, 0xb6, 0x5, 0x38, 0x7f, 0x42, 0xfb, 0xc6, 0x81, 0xbc, 0xf, 0x32, 0x75, 0x48, 0xe, 0x33, 0x74, 0x49, 0xfa, 0xc7, 0x80, 0xbd, 0xc, 0x31, 0x76, 0x4b, 0xf8, 0xc5, 0x82, 0xbf, 0xf9, 0xc4, 0x83, 0xbe, 0xd, 0x30, 0x77, 0x4a, 0x8, 0x35, 0x72, 0x4f, 0xfc, 0xc1, 0x86, 0xbb, 0xfd, 0xc0, 0x87, 0xba, 0x9, 0x34, 0x73, 0x4e, 0xff, 0xc2, 0x85, 0xb8, 0xb, 0x36, 0x71, 0x4c, 0xa, 0x37, 0x70, 0x4d, 0xfe, 0xc3, 0x84, 0xb9, 0xeb, 0xd6, 0x91, 0xac, 0x1f, 0x22, 0x65, 0x58, 0x1e, 0x23, 0x64, 0x59, 0xea, 0xd7, 0x90, 0xad, 0x1c, 0x21, 0x66, 0x5b, 0xe8, 0xd5, 0x92, 0xaf, 0xe9, 0xd4, 0x93, 0xae, 0x1d, 0x20, 0x67, 0x5a, 0x18, 0x25, 0x62, 0x5f, 0xec, 0xd1, 0x96, 0xab, 0xed, 0xd0, 0x97, 0xaa, 0x19, 0x24, 0x63, 0x5e, 0xef, 0xd2, 0x95, 0xa8, 0x1b, 0x26, 0x61, 0x5c, 0x1a, 0x27, 0x60, 0x5d, 0xee, 0xd3, 0x94, 0xa9, 0x10, 0x2d, 0x6a, 0x57, 0xe4, 0xd9, 0x9e, 0xa3, 0xe5, 0xd8, 0x9f, 0xa2, 0x11, 0x2c, 0x6b, 0x56, 0xe7, 0xda, 0x9d, 0xa0, 0x13, 0x2e, 0x69, 0x54, 0x12, 0x2f, 0x68, 0x55, 0xe6, 0xdb, 0x9c, 0xa1, 0xe3, 0xde, 0x99, 0xa4, 0x17, 0x2a, 0x6d, 0x50, 0x16, 0x2b, 0x6c, 0x51, 0xe2, 0xdf, 0x98, 0xa5, 0x14, 0x29, 0x6e, 0x53, 0xe0, 0xdd, 0x9a, 0xa7, 0xe1, 0xdc, 0x9b, 0xa6, 0x15, 0x28, 0x6f, 0x52},
+ {0x0, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57, 0xc7, 0xf9, 0xbb, 0x85, 0x3f, 0x1, 0x43, 0x7d, 0x2a, 0x14, 0x56, 0x68, 0xd2, 0xec, 0xae, 0x90, 0x93, 0xad, 0xef, 0xd1, 0x6b, 0x55, 0x17, 0x29, 0x7e, 0x40, 0x2, 0x3c, 0x86, 0xb8, 0xfa, 0xc4, 0x54, 0x6a, 0x28, 0x16, 0xac, 0x92, 0xd0, 0xee, 0xb9, 0x87, 0xc5, 0xfb, 0x41, 0x7f, 0x3d, 0x3, 0x3b, 0x5, 0x47, 0x79, 0xc3, 0xfd, 0xbf, 0x81, 0xd6, 0xe8, 0xaa, 0x94, 0x2e, 0x10, 0x52, 0x6c, 0xfc, 0xc2, 0x80, 0xbe, 0x4, 0x3a, 0x78, 0x46, 0x11, 0x2f, 0x6d, 0x53, 0xe9, 0xd7, 0x95, 0xab, 0xa8, 0x96, 0xd4, 0xea, 0x50, 0x6e, 0x2c, 0x12, 0x45, 0x7b, 0x39, 0x7, 0xbd, 0x83, 0xc1, 0xff, 0x6f, 0x51, 0x13, 0x2d, 0x97, 0xa9, 0xeb, 0xd5, 0x82, 0xbc, 0xfe, 0xc0, 0x7a, 0x44, 0x6, 0x38, 0x76, 0x48, 0xa, 0x34, 0x8e, 0xb0, 0xf2, 0xcc, 0x9b, 0xa5, 0xe7, 0xd9, 0x63, 0x5d, 0x1f, 0x21, 0xb1, 0x8f, 0xcd, 0xf3, 0x49, 0x77, 0x35, 0xb, 0x5c, 0x62, 0x20, 0x1e, 0xa4, 0x9a, 0xd8, 0xe6, 0xe5, 0xdb, 0x99, 0xa7, 0x1d, 0x23, 0x61, 0x5f, 0x8, 0x36, 0x74, 0x4a, 0xf0, 0xce, 0x8c, 0xb2, 0x22, 0x1c, 0x5e, 0x60, 0xda, 0xe4, 0xa6, 0x98, 0xcf, 0xf1, 0xb3, 0x8d, 0x37, 0x9, 0x4b, 0x75, 0x4d, 0x73, 0x31, 0xf, 0xb5, 0x8b, 0xc9, 0xf7, 0xa0, 0x9e, 0xdc, 0xe2, 0x58, 0x66, 0x24, 0x1a, 0x8a, 0xb4, 0xf6, 0xc8, 0x72, 0x4c, 0xe, 0x30, 0x67, 0x59, 0x1b, 0x25, 0x9f, 0xa1, 0xe3, 0xdd, 0xde, 0xe0, 0xa2, 0x9c, 0x26, 0x18, 0x5a, 0x64, 0x33, 0xd, 0x4f, 0x71, 0xcb, 0xf5, 0xb7, 0x89, 0x19, 0x27, 0x65, 0x5b, 0xe1, 0xdf, 0x9d, 0xa3, 0xf4, 0xca, 0x88, 0xb6, 0xc, 0x32, 0x70, 0x4e},
+ {0x0, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58, 0xd7, 0xe8, 0xa9, 0x96, 0x2b, 0x14, 0x55, 0x6a, 0x32, 0xd, 0x4c, 0x73, 0xce, 0xf1, 0xb0, 0x8f, 0xb3, 0x8c, 0xcd, 0xf2, 0x4f, 0x70, 0x31, 0xe, 0x56, 0x69, 0x28, 0x17, 0xaa, 0x95, 0xd4, 0xeb, 0x64, 0x5b, 0x1a, 0x25, 0x98, 0xa7, 0xe6, 0xd9, 0x81, 0xbe, 0xff, 0xc0, 0x7d, 0x42, 0x3, 0x3c, 0x7b, 0x44, 0x5, 0x3a, 0x87, 0xb8, 0xf9, 0xc6, 0x9e, 0xa1, 0xe0, 0xdf, 0x62, 0x5d, 0x1c, 0x23, 0xac, 0x93, 0xd2, 0xed, 0x50, 0x6f, 0x2e, 0x11, 0x49, 0x76, 0x37, 0x8, 0xb5, 0x8a, 0xcb, 0xf4, 0xc8, 0xf7, 0xb6, 0x89, 0x34, 0xb, 0x4a, 0x75, 0x2d, 0x12, 0x53, 0x6c, 0xd1, 0xee, 0xaf, 0x90, 0x1f, 0x20, 0x61, 0x5e, 0xe3, 0xdc, 0x9d, 0xa2, 0xfa, 0xc5, 0x84, 0xbb, 0x6, 0x39, 0x78, 0x47, 0xf6, 0xc9, 0x88, 0xb7, 0xa, 0x35, 0x74, 0x4b, 0x13, 0x2c, 0x6d, 0x52, 0xef, 0xd0, 0x91, 0xae, 0x21, 0x1e, 0x5f, 0x60, 0xdd, 0xe2, 0xa3, 0x9c, 0xc4, 0xfb, 0xba, 0x85, 0x38, 0x7, 0x46, 0x79, 0x45, 0x7a, 0x3b, 0x4, 0xb9, 0x86, 0xc7, 0xf8, 0xa0, 0x9f, 0xde, 0xe1, 0x5c, 0x63, 0x22, 0x1d, 0x92, 0xad, 0xec, 0xd3, 0x6e, 0x51, 0x10, 0x2f, 0x77, 0x48, 0x9, 0x36, 0x8b, 0xb4, 0xf5, 0xca, 0x8d, 0xb2, 0xf3, 0xcc, 0x71, 0x4e, 0xf, 0x30, 0x68, 0x57, 0x16, 0x29, 0x94, 0xab, 0xea, 0xd5, 0x5a, 0x65, 0x24, 0x1b, 0xa6, 0x99, 0xd8, 0xe7, 0xbf, 0x80, 0xc1, 0xfe, 0x43, 0x7c, 0x3d, 0x2, 0x3e, 0x1, 0x40, 0x7f, 0xc2, 0xfd, 0xbc, 0x83, 0xdb, 0xe4, 0xa5, 0x9a, 0x27, 0x18, 0x59, 0x66, 0xe9, 0xd6, 0x97, 0xa8, 0x15, 0x2a, 0x6b, 0x54, 0xc, 0x33, 0x72, 0x4d, 0xf0, 0xcf, 0x8e, 0xb1},
+ {0x0, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7, 0x74, 0x34, 0xf4, 0xb4, 0x69, 0x29, 0xe9, 0xa9, 0x4e, 0xe, 0xce, 0x8e, 0x53, 0x13, 0xd3, 0x93, 0xe8, 0xa8, 0x68, 0x28, 0xf5, 0xb5, 0x75, 0x35, 0xd2, 0x92, 0x52, 0x12, 0xcf, 0x8f, 0x4f, 0xf, 0x9c, 0xdc, 0x1c, 0x5c, 0x81, 0xc1, 0x1, 0x41, 0xa6, 0xe6, 0x26, 0x66, 0xbb, 0xfb, 0x3b, 0x7b, 0xcd, 0x8d, 0x4d, 0xd, 0xd0, 0x90, 0x50, 0x10, 0xf7, 0xb7, 0x77, 0x37, 0xea, 0xaa, 0x6a, 0x2a, 0xb9, 0xf9, 0x39, 0x79, 0xa4, 0xe4, 0x24, 0x64, 0x83, 0xc3, 0x3, 0x43, 0x9e, 0xde, 0x1e, 0x5e, 0x25, 0x65, 0xa5, 0xe5, 0x38, 0x78, 0xb8, 0xf8, 0x1f, 0x5f, 0x9f, 0xdf, 0x2, 0x42, 0x82, 0xc2, 0x51, 0x11, 0xd1, 0x91, 0x4c, 0xc, 0xcc, 0x8c, 0x6b, 0x2b, 0xeb, 0xab, 0x76, 0x36, 0xf6, 0xb6, 0x87, 0xc7, 0x7, 0x47, 0x9a, 0xda, 0x1a, 0x5a, 0xbd, 0xfd, 0x3d, 0x7d, 0xa0, 0xe0, 0x20, 0x60, 0xf3, 0xb3, 0x73, 0x33, 0xee, 0xae, 0x6e, 0x2e, 0xc9, 0x89, 0x49, 0x9, 0xd4, 0x94, 0x54, 0x14, 0x6f, 0x2f, 0xef, 0xaf, 0x72, 0x32, 0xf2, 0xb2, 0x55, 0x15, 0xd5, 0x95, 0x48, 0x8, 0xc8, 0x88, 0x1b, 0x5b, 0x9b, 0xdb, 0x6, 0x46, 0x86, 0xc6, 0x21, 0x61, 0xa1, 0xe1, 0x3c, 0x7c, 0xbc, 0xfc, 0x4a, 0xa, 0xca, 0x8a, 0x57, 0x17, 0xd7, 0x97, 0x70, 0x30, 0xf0, 0xb0, 0x6d, 0x2d, 0xed, 0xad, 0x3e, 0x7e, 0xbe, 0xfe, 0x23, 0x63, 0xa3, 0xe3, 0x4, 0x44, 0x84, 0xc4, 0x19, 0x59, 0x99, 0xd9, 0xa2, 0xe2, 0x22, 0x62, 0xbf, 0xff, 0x3f, 0x7f, 0x98, 0xd8, 0x18, 0x58, 0x85, 0xc5, 0x5, 0x45, 0xd6, 0x96, 0x56, 0x16, 0xcb, 0x8b, 0x4b, 0xb, 0xec, 0xac, 0x6c, 0x2c, 0xf1, 0xb1, 0x71, 0x31},
+ {0x0, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8, 0x64, 0x25, 0xe6, 0xa7, 0x7d, 0x3c, 0xff, 0xbe, 0x56, 0x17, 0xd4, 0x95, 0x4f, 0xe, 0xcd, 0x8c, 0xc8, 0x89, 0x4a, 0xb, 0xd1, 0x90, 0x53, 0x12, 0xfa, 0xbb, 0x78, 0x39, 0xe3, 0xa2, 0x61, 0x20, 0xac, 0xed, 0x2e, 0x6f, 0xb5, 0xf4, 0x37, 0x76, 0x9e, 0xdf, 0x1c, 0x5d, 0x87, 0xc6, 0x5, 0x44, 0x8d, 0xcc, 0xf, 0x4e, 0x94, 0xd5, 0x16, 0x57, 0xbf, 0xfe, 0x3d, 0x7c, 0xa6, 0xe7, 0x24, 0x65, 0xe9, 0xa8, 0x6b, 0x2a, 0xf0, 0xb1, 0x72, 0x33, 0xdb, 0x9a, 0x59, 0x18, 0xc2, 0x83, 0x40, 0x1, 0x45, 0x4, 0xc7, 0x86, 0x5c, 0x1d, 0xde, 0x9f, 0x77, 0x36, 0xf5, 0xb4, 0x6e, 0x2f, 0xec, 0xad, 0x21, 0x60, 0xa3, 0xe2, 0x38, 0x79, 0xba, 0xfb, 0x13, 0x52, 0x91, 0xd0, 0xa, 0x4b, 0x88, 0xc9, 0x7, 0x46, 0x85, 0xc4, 0x1e, 0x5f, 0x9c, 0xdd, 0x35, 0x74, 0xb7, 0xf6, 0x2c, 0x6d, 0xae, 0xef, 0x63, 0x22, 0xe1, 0xa0, 0x7a, 0x3b, 0xf8, 0xb9, 0x51, 0x10, 0xd3, 0x92, 0x48, 0x9, 0xca, 0x8b, 0xcf, 0x8e, 0x4d, 0xc, 0xd6, 0x97, 0x54, 0x15, 0xfd, 0xbc, 0x7f, 0x3e, 0xe4, 0xa5, 0x66, 0x27, 0xab, 0xea, 0x29, 0x68, 0xb2, 0xf3, 0x30, 0x71, 0x99, 0xd8, 0x1b, 0x5a, 0x80, 0xc1, 0x2, 0x43, 0x8a, 0xcb, 0x8, 0x49, 0x93, 0xd2, 0x11, 0x50, 0xb8, 0xf9, 0x3a, 0x7b, 0xa1, 0xe0, 0x23, 0x62, 0xee, 0xaf, 0x6c, 0x2d, 0xf7, 0xb6, 0x75, 0x34, 0xdc, 0x9d, 0x5e, 0x1f, 0xc5, 0x84, 0x47, 0x6, 0x42, 0x3, 0xc0, 0x81, 0x5b, 0x1a, 0xd9, 0x98, 0x70, 0x31, 0xf2, 0xb3, 0x69, 0x28, 0xeb, 0xaa, 0x26, 0x67, 0xa4, 0xe5, 0x3f, 0x7e, 0xbd, 0xfc, 0x14, 0x55, 0x96, 0xd7, 0xd, 0x4c, 0x8f, 0xce},
+ {0x0, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9, 0x54, 0x16, 0xd0, 0x92, 0x41, 0x3, 0xc5, 0x87, 0x7e, 0x3c, 0xfa, 0xb8, 0x6b, 0x29, 0xef, 0xad, 0xa8, 0xea, 0x2c, 0x6e, 0xbd, 0xff, 0x39, 0x7b, 0x82, 0xc0, 0x6, 0x44, 0x97, 0xd5, 0x13, 0x51, 0xfc, 0xbe, 0x78, 0x3a, 0xe9, 0xab, 0x6d, 0x2f, 0xd6, 0x94, 0x52, 0x10, 0xc3, 0x81, 0x47, 0x5, 0x4d, 0xf, 0xc9, 0x8b, 0x58, 0x1a, 0xdc, 0x9e, 0x67, 0x25, 0xe3, 0xa1, 0x72, 0x30, 0xf6, 0xb4, 0x19, 0x5b, 0x9d, 0xdf, 0xc, 0x4e, 0x88, 0xca, 0x33, 0x71, 0xb7, 0xf5, 0x26, 0x64, 0xa2, 0xe0, 0xe5, 0xa7, 0x61, 0x23, 0xf0, 0xb2, 0x74, 0x36, 0xcf, 0x8d, 0x4b, 0x9, 0xda, 0x98, 0x5e, 0x1c, 0xb1, 0xf3, 0x35, 0x77, 0xa4, 0xe6, 0x20, 0x62, 0x9b, 0xd9, 0x1f, 0x5d, 0x8e, 0xcc, 0xa, 0x48, 0x9a, 0xd8, 0x1e, 0x5c, 0x8f, 0xcd, 0xb, 0x49, 0xb0, 0xf2, 0x34, 0x76, 0xa5, 0xe7, 0x21, 0x63, 0xce, 0x8c, 0x4a, 0x8, 0xdb, 0x99, 0x5f, 0x1d, 0xe4, 0xa6, 0x60, 0x22, 0xf1, 0xb3, 0x75, 0x37, 0x32, 0x70, 0xb6, 0xf4, 0x27, 0x65, 0xa3, 0xe1, 0x18, 0x5a, 0x9c, 0xde, 0xd, 0x4f, 0x89, 0xcb, 0x66, 0x24, 0xe2, 0xa0, 0x73, 0x31, 0xf7, 0xb5, 0x4c, 0xe, 0xc8, 0x8a, 0x59, 0x1b, 0xdd, 0x9f, 0xd7, 0x95, 0x53, 0x11, 0xc2, 0x80, 0x46, 0x4, 0xfd, 0xbf, 0x79, 0x3b, 0xe8, 0xaa, 0x6c, 0x2e, 0x83, 0xc1, 0x7, 0x45, 0x96, 0xd4, 0x12, 0x50, 0xa9, 0xeb, 0x2d, 0x6f, 0xbc, 0xfe, 0x38, 0x7a, 0x7f, 0x3d, 0xfb, 0xb9, 0x6a, 0x28, 0xee, 0xac, 0x55, 0x17, 0xd1, 0x93, 0x40, 0x2, 0xc4, 0x86, 0x2b, 0x69, 0xaf, 0xed, 0x3e, 0x7c, 0xba, 0xf8, 0x1, 0x43, 0x85, 0xc7, 0x14, 0x56, 0x90, 0xd2},
+ {0x0, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6, 0x44, 0x7, 0xc2, 0x81, 0x55, 0x16, 0xd3, 0x90, 0x66, 0x25, 0xe0, 0xa3, 0x77, 0x34, 0xf1, 0xb2, 0x88, 0xcb, 0xe, 0x4d, 0x99, 0xda, 0x1f, 0x5c, 0xaa, 0xe9, 0x2c, 0x6f, 0xbb, 0xf8, 0x3d, 0x7e, 0xcc, 0x8f, 0x4a, 0x9, 0xdd, 0x9e, 0x5b, 0x18, 0xee, 0xad, 0x68, 0x2b, 0xff, 0xbc, 0x79, 0x3a, 0xd, 0x4e, 0x8b, 0xc8, 0x1c, 0x5f, 0x9a, 0xd9, 0x2f, 0x6c, 0xa9, 0xea, 0x3e, 0x7d, 0xb8, 0xfb, 0x49, 0xa, 0xcf, 0x8c, 0x58, 0x1b, 0xde, 0x9d, 0x6b, 0x28, 0xed, 0xae, 0x7a, 0x39, 0xfc, 0xbf, 0x85, 0xc6, 0x3, 0x40, 0x94, 0xd7, 0x12, 0x51, 0xa7, 0xe4, 0x21, 0x62, 0xb6, 0xf5, 0x30, 0x73, 0xc1, 0x82, 0x47, 0x4, 0xd0, 0x93, 0x56, 0x15, 0xe3, 0xa0, 0x65, 0x26, 0xf2, 0xb1, 0x74, 0x37, 0x1a, 0x59, 0x9c, 0xdf, 0xb, 0x48, 0x8d, 0xce, 0x38, 0x7b, 0xbe, 0xfd, 0x29, 0x6a, 0xaf, 0xec, 0x5e, 0x1d, 0xd8, 0x9b, 0x4f, 0xc, 0xc9, 0x8a, 0x7c, 0x3f, 0xfa, 0xb9, 0x6d, 0x2e, 0xeb, 0xa8, 0x92, 0xd1, 0x14, 0x57, 0x83, 0xc0, 0x5, 0x46, 0xb0, 0xf3, 0x36, 0x75, 0xa1, 0xe2, 0x27, 0x64, 0xd6, 0x95, 0x50, 0x13, 0xc7, 0x84, 0x41, 0x2, 0xf4, 0xb7, 0x72, 0x31, 0xe5, 0xa6, 0x63, 0x20, 0x17, 0x54, 0x91, 0xd2, 0x6, 0x45, 0x80, 0xc3, 0x35, 0x76, 0xb3, 0xf0, 0x24, 0x67, 0xa2, 0xe1, 0x53, 0x10, 0xd5, 0x96, 0x42, 0x1, 0xc4, 0x87, 0x71, 0x32, 0xf7, 0xb4, 0x60, 0x23, 0xe6, 0xa5, 0x9f, 0xdc, 0x19, 0x5a, 0x8e, 0xcd, 0x8, 0x4b, 0xbd, 0xfe, 0x3b, 0x78, 0xac, 0xef, 0x2a, 0x69, 0xdb, 0x98, 0x5d, 0x1e, 0xca, 0x89, 0x4c, 0xf, 0xf9, 0xba, 0x7f, 0x3c, 0xe8, 0xab, 0x6e, 0x2d},
+ {0x0, 0x44, 0x88, 0xcc, 0xd, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb, 0x34, 0x70, 0xbc, 0xf8, 0x39, 0x7d, 0xb1, 0xf5, 0x2e, 0x6a, 0xa6, 0xe2, 0x23, 0x67, 0xab, 0xef, 0x68, 0x2c, 0xe0, 0xa4, 0x65, 0x21, 0xed, 0xa9, 0x72, 0x36, 0xfa, 0xbe, 0x7f, 0x3b, 0xf7, 0xb3, 0x5c, 0x18, 0xd4, 0x90, 0x51, 0x15, 0xd9, 0x9d, 0x46, 0x2, 0xce, 0x8a, 0x4b, 0xf, 0xc3, 0x87, 0xd0, 0x94, 0x58, 0x1c, 0xdd, 0x99, 0x55, 0x11, 0xca, 0x8e, 0x42, 0x6, 0xc7, 0x83, 0x4f, 0xb, 0xe4, 0xa0, 0x6c, 0x28, 0xe9, 0xad, 0x61, 0x25, 0xfe, 0xba, 0x76, 0x32, 0xf3, 0xb7, 0x7b, 0x3f, 0xb8, 0xfc, 0x30, 0x74, 0xb5, 0xf1, 0x3d, 0x79, 0xa2, 0xe6, 0x2a, 0x6e, 0xaf, 0xeb, 0x27, 0x63, 0x8c, 0xc8, 0x4, 0x40, 0x81, 0xc5, 0x9, 0x4d, 0x96, 0xd2, 0x1e, 0x5a, 0x9b, 0xdf, 0x13, 0x57, 0xbd, 0xf9, 0x35, 0x71, 0xb0, 0xf4, 0x38, 0x7c, 0xa7, 0xe3, 0x2f, 0x6b, 0xaa, 0xee, 0x22, 0x66, 0x89, 0xcd, 0x1, 0x45, 0x84, 0xc0, 0xc, 0x48, 0x93, 0xd7, 0x1b, 0x5f, 0x9e, 0xda, 0x16, 0x52, 0xd5, 0x91, 0x5d, 0x19, 0xd8, 0x9c, 0x50, 0x14, 0xcf, 0x8b, 0x47, 0x3, 0xc2, 0x86, 0x4a, 0xe, 0xe1, 0xa5, 0x69, 0x2d, 0xec, 0xa8, 0x64, 0x20, 0xfb, 0xbf, 0x73, 0x37, 0xf6, 0xb2, 0x7e, 0x3a, 0x6d, 0x29, 0xe5, 0xa1, 0x60, 0x24, 0xe8, 0xac, 0x77, 0x33, 0xff, 0xbb, 0x7a, 0x3e, 0xf2, 0xb6, 0x59, 0x1d, 0xd1, 0x95, 0x54, 0x10, 0xdc, 0x98, 0x43, 0x7, 0xcb, 0x8f, 0x4e, 0xa, 0xc6, 0x82, 0x5, 0x41, 0x8d, 0xc9, 0x8, 0x4c, 0x80, 0xc4, 0x1f, 0x5b, 0x97, 0xd3, 0x12, 0x56, 0x9a, 0xde, 0x31, 0x75, 0xb9, 0xfd, 0x3c, 0x78, 0xb4, 0xf0, 0x2b, 0x6f, 0xa3, 0xe7, 0x26, 0x62, 0xae, 0xea},
+ {0x0, 0x45, 0x8a, 0xcf, 0x9, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4, 0x24, 0x61, 0xae, 0xeb, 0x2d, 0x68, 0xa7, 0xe2, 0x36, 0x73, 0xbc, 0xf9, 0x3f, 0x7a, 0xb5, 0xf0, 0x48, 0xd, 0xc2, 0x87, 0x41, 0x4, 0xcb, 0x8e, 0x5a, 0x1f, 0xd0, 0x95, 0x53, 0x16, 0xd9, 0x9c, 0x6c, 0x29, 0xe6, 0xa3, 0x65, 0x20, 0xef, 0xaa, 0x7e, 0x3b, 0xf4, 0xb1, 0x77, 0x32, 0xfd, 0xb8, 0x90, 0xd5, 0x1a, 0x5f, 0x99, 0xdc, 0x13, 0x56, 0x82, 0xc7, 0x8, 0x4d, 0x8b, 0xce, 0x1, 0x44, 0xb4, 0xf1, 0x3e, 0x7b, 0xbd, 0xf8, 0x37, 0x72, 0xa6, 0xe3, 0x2c, 0x69, 0xaf, 0xea, 0x25, 0x60, 0xd8, 0x9d, 0x52, 0x17, 0xd1, 0x94, 0x5b, 0x1e, 0xca, 0x8f, 0x40, 0x5, 0xc3, 0x86, 0x49, 0xc, 0xfc, 0xb9, 0x76, 0x33, 0xf5, 0xb0, 0x7f, 0x3a, 0xee, 0xab, 0x64, 0x21, 0xe7, 0xa2, 0x6d, 0x28, 0x3d, 0x78, 0xb7, 0xf2, 0x34, 0x71, 0xbe, 0xfb, 0x2f, 0x6a, 0xa5, 0xe0, 0x26, 0x63, 0xac, 0xe9, 0x19, 0x5c, 0x93, 0xd6, 0x10, 0x55, 0x9a, 0xdf, 0xb, 0x4e, 0x81, 0xc4, 0x2, 0x47, 0x88, 0xcd, 0x75, 0x30, 0xff, 0xba, 0x7c, 0x39, 0xf6, 0xb3, 0x67, 0x22, 0xed, 0xa8, 0x6e, 0x2b, 0xe4, 0xa1, 0x51, 0x14, 0xdb, 0x9e, 0x58, 0x1d, 0xd2, 0x97, 0x43, 0x6, 0xc9, 0x8c, 0x4a, 0xf, 0xc0, 0x85, 0xad, 0xe8, 0x27, 0x62, 0xa4, 0xe1, 0x2e, 0x6b, 0xbf, 0xfa, 0x35, 0x70, 0xb6, 0xf3, 0x3c, 0x79, 0x89, 0xcc, 0x3, 0x46, 0x80, 0xc5, 0xa, 0x4f, 0x9b, 0xde, 0x11, 0x54, 0x92, 0xd7, 0x18, 0x5d, 0xe5, 0xa0, 0x6f, 0x2a, 0xec, 0xa9, 0x66, 0x23, 0xf7, 0xb2, 0x7d, 0x38, 0xfe, 0xbb, 0x74, 0x31, 0xc1, 0x84, 0x4b, 0xe, 0xc8, 0x8d, 0x42, 0x7, 0xd3, 0x96, 0x59, 0x1c, 0xda, 0x9f, 0x50, 0x15},
+ {0x0, 0x46, 0x8c, 0xca, 0x5, 0x43, 0x89, 0xcf, 0xa, 0x4c, 0x86, 0xc0, 0xf, 0x49, 0x83, 0xc5, 0x14, 0x52, 0x98, 0xde, 0x11, 0x57, 0x9d, 0xdb, 0x1e, 0x58, 0x92, 0xd4, 0x1b, 0x5d, 0x97, 0xd1, 0x28, 0x6e, 0xa4, 0xe2, 0x2d, 0x6b, 0xa1, 0xe7, 0x22, 0x64, 0xae, 0xe8, 0x27, 0x61, 0xab, 0xed, 0x3c, 0x7a, 0xb0, 0xf6, 0x39, 0x7f, 0xb5, 0xf3, 0x36, 0x70, 0xba, 0xfc, 0x33, 0x75, 0xbf, 0xf9, 0x50, 0x16, 0xdc, 0x9a, 0x55, 0x13, 0xd9, 0x9f, 0x5a, 0x1c, 0xd6, 0x90, 0x5f, 0x19, 0xd3, 0x95, 0x44, 0x2, 0xc8, 0x8e, 0x41, 0x7, 0xcd, 0x8b, 0x4e, 0x8, 0xc2, 0x84, 0x4b, 0xd, 0xc7, 0x81, 0x78, 0x3e, 0xf4, 0xb2, 0x7d, 0x3b, 0xf1, 0xb7, 0x72, 0x34, 0xfe, 0xb8, 0x77, 0x31, 0xfb, 0xbd, 0x6c, 0x2a, 0xe0, 0xa6, 0x69, 0x2f, 0xe5, 0xa3, 0x66, 0x20, 0xea, 0xac, 0x63, 0x25, 0xef, 0xa9, 0xa0, 0xe6, 0x2c, 0x6a, 0xa5, 0xe3, 0x29, 0x6f, 0xaa, 0xec, 0x26, 0x60, 0xaf, 0xe9, 0x23, 0x65, 0xb4, 0xf2, 0x38, 0x7e, 0xb1, 0xf7, 0x3d, 0x7b, 0xbe, 0xf8, 0x32, 0x74, 0xbb, 0xfd, 0x37, 0x71, 0x88, 0xce, 0x4, 0x42, 0x8d, 0xcb, 0x1, 0x47, 0x82, 0xc4, 0xe, 0x48, 0x87, 0xc1, 0xb, 0x4d, 0x9c, 0xda, 0x10, 0x56, 0x99, 0xdf, 0x15, 0x53, 0x96, 0xd0, 0x1a, 0x5c, 0x93, 0xd5, 0x1f, 0x59, 0xf0, 0xb6, 0x7c, 0x3a, 0xf5, 0xb3, 0x79, 0x3f, 0xfa, 0xbc, 0x76, 0x30, 0xff, 0xb9, 0x73, 0x35, 0xe4, 0xa2, 0x68, 0x2e, 0xe1, 0xa7, 0x6d, 0x2b, 0xee, 0xa8, 0x62, 0x24, 0xeb, 0xad, 0x67, 0x21, 0xd8, 0x9e, 0x54, 0x12, 0xdd, 0x9b, 0x51, 0x17, 0xd2, 0x94, 0x5e, 0x18, 0xd7, 0x91, 0x5b, 0x1d, 0xcc, 0x8a, 0x40, 0x6, 0xc9, 0x8f, 0x45, 0x3, 0xc6, 0x80, 0x4a, 0xc, 0xc3, 0x85, 0x4f, 0x9},
+ {0x0, 0x47, 0x8e, 0xc9, 0x1, 0x46, 0x8f, 0xc8, 0x2, 0x45, 0x8c, 0xcb, 0x3, 0x44, 0x8d, 0xca, 0x4, 0x43, 0x8a, 0xcd, 0x5, 0x42, 0x8b, 0xcc, 0x6, 0x41, 0x88, 0xcf, 0x7, 0x40, 0x89, 0xce, 0x8, 0x4f, 0x86, 0xc1, 0x9, 0x4e, 0x87, 0xc0, 0xa, 0x4d, 0x84, 0xc3, 0xb, 0x4c, 0x85, 0xc2, 0xc, 0x4b, 0x82, 0xc5, 0xd, 0x4a, 0x83, 0xc4, 0xe, 0x49, 0x80, 0xc7, 0xf, 0x48, 0x81, 0xc6, 0x10, 0x57, 0x9e, 0xd9, 0x11, 0x56, 0x9f, 0xd8, 0x12, 0x55, 0x9c, 0xdb, 0x13, 0x54, 0x9d, 0xda, 0x14, 0x53, 0x9a, 0xdd, 0x15, 0x52, 0x9b, 0xdc, 0x16, 0x51, 0x98, 0xdf, 0x17, 0x50, 0x99, 0xde, 0x18, 0x5f, 0x96, 0xd1, 0x19, 0x5e, 0x97, 0xd0, 0x1a, 0x5d, 0x94, 0xd3, 0x1b, 0x5c, 0x95, 0xd2, 0x1c, 0x5b, 0x92, 0xd5, 0x1d, 0x5a, 0x93, 0xd4, 0x1e, 0x59, 0x90, 0xd7, 0x1f, 0x58, 0x91, 0xd6, 0x20, 0x67, 0xae, 0xe9, 0x21, 0x66, 0xaf, 0xe8, 0x22, 0x65, 0xac, 0xeb, 0x23, 0x64, 0xad, 0xea, 0x24, 0x63, 0xaa, 0xed, 0x25, 0x62, 0xab, 0xec, 0x26, 0x61, 0xa8, 0xef, 0x27, 0x60, 0xa9, 0xee, 0x28, 0x6f, 0xa6, 0xe1, 0x29, 0x6e, 0xa7, 0xe0, 0x2a, 0x6d, 0xa4, 0xe3, 0x2b, 0x6c, 0xa5, 0xe2, 0x2c, 0x6b, 0xa2, 0xe5, 0x2d, 0x6a, 0xa3, 0xe4, 0x2e, 0x69, 0xa0, 0xe7, 0x2f, 0x68, 0xa1, 0xe6, 0x30, 0x77, 0xbe, 0xf9, 0x31, 0x76, 0xbf, 0xf8, 0x32, 0x75, 0xbc, 0xfb, 0x33, 0x74, 0xbd, 0xfa, 0x34, 0x73, 0xba, 0xfd, 0x35, 0x72, 0xbb, 0xfc, 0x36, 0x71, 0xb8, 0xff, 0x37, 0x70, 0xb9, 0xfe, 0x38, 0x7f, 0xb6, 0xf1, 0x39, 0x7e, 0xb7, 0xf0, 0x3a, 0x7d, 0xb4, 0xf3, 0x3b, 0x7c, 0xb5, 0xf2, 0x3c, 0x7b, 0xb2, 0xf5, 0x3d, 0x7a, 0xb3, 0xf4, 0x3e, 0x79, 0xb0, 0xf7, 0x3f, 0x78, 0xb1, 0xf6},
+ {0x0, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0xf, 0xd7, 0x9f, 0xf4, 0xbc, 0x64, 0x2c, 0xc9, 0x81, 0x59, 0x11, 0x8e, 0xc6, 0x1e, 0x56, 0xb3, 0xfb, 0x23, 0x6b, 0xf5, 0xbd, 0x65, 0x2d, 0xc8, 0x80, 0x58, 0x10, 0x8f, 0xc7, 0x1f, 0x57, 0xb2, 0xfa, 0x22, 0x6a, 0x1, 0x49, 0x91, 0xd9, 0x3c, 0x74, 0xac, 0xe4, 0x7b, 0x33, 0xeb, 0xa3, 0x46, 0xe, 0xd6, 0x9e, 0xf7, 0xbf, 0x67, 0x2f, 0xca, 0x82, 0x5a, 0x12, 0x8d, 0xc5, 0x1d, 0x55, 0xb0, 0xf8, 0x20, 0x68, 0x3, 0x4b, 0x93, 0xdb, 0x3e, 0x76, 0xae, 0xe6, 0x79, 0x31, 0xe9, 0xa1, 0x44, 0xc, 0xd4, 0x9c, 0x2, 0x4a, 0x92, 0xda, 0x3f, 0x77, 0xaf, 0xe7, 0x78, 0x30, 0xe8, 0xa0, 0x45, 0xd, 0xd5, 0x9d, 0xf6, 0xbe, 0x66, 0x2e, 0xcb, 0x83, 0x5b, 0x13, 0x8c, 0xc4, 0x1c, 0x54, 0xb1, 0xf9, 0x21, 0x69, 0xf3, 0xbb, 0x63, 0x2b, 0xce, 0x86, 0x5e, 0x16, 0x89, 0xc1, 0x19, 0x51, 0xb4, 0xfc, 0x24, 0x6c, 0x7, 0x4f, 0x97, 0xdf, 0x3a, 0x72, 0xaa, 0xe2, 0x7d, 0x35, 0xed, 0xa5, 0x40, 0x8, 0xd0, 0x98, 0x6, 0x4e, 0x96, 0xde, 0x3b, 0x73, 0xab, 0xe3, 0x7c, 0x34, 0xec, 0xa4, 0x41, 0x9, 0xd1, 0x99, 0xf2, 0xba, 0x62, 0x2a, 0xcf, 0x87, 0x5f, 0x17, 0x88, 0xc0, 0x18, 0x50, 0xb5, 0xfd, 0x25, 0x6d, 0x4, 0x4c, 0x94, 0xdc, 0x39, 0x71, 0xa9, 0xe1, 0x7e, 0x36, 0xee, 0xa6, 0x43, 0xb, 0xd3, 0x9b, 0xf0, 0xb8, 0x60, 0x28, 0xcd, 0x85, 0x5d, 0x15, 0x8a, 0xc2, 0x1a, 0x52, 0xb7, 0xff, 0x27, 0x6f, 0xf1, 0xb9, 0x61, 0x29, 0xcc, 0x84, 0x5c, 0x14, 0x8b, 0xc3, 0x1b, 0x53, 0xb6, 0xfe, 0x26, 0x6e, 0x5, 0x4d, 0x95, 0xdd, 0x38, 0x70, 0xa8, 0xe0, 0x7f, 0x37, 0xef, 0xa7, 0x42, 0xa, 0xd2, 0x9a},
+ {0x0, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x2, 0xd9, 0x90, 0xe4, 0xad, 0x76, 0x3f, 0xdd, 0x94, 0x4f, 0x6, 0x96, 0xdf, 0x4, 0x4d, 0xaf, 0xe6, 0x3d, 0x74, 0xd5, 0x9c, 0x47, 0xe, 0xec, 0xa5, 0x7e, 0x37, 0xa7, 0xee, 0x35, 0x7c, 0x9e, 0xd7, 0xc, 0x45, 0x31, 0x78, 0xa3, 0xea, 0x8, 0x41, 0x9a, 0xd3, 0x43, 0xa, 0xd1, 0x98, 0x7a, 0x33, 0xe8, 0xa1, 0xb7, 0xfe, 0x25, 0x6c, 0x8e, 0xc7, 0x1c, 0x55, 0xc5, 0x8c, 0x57, 0x1e, 0xfc, 0xb5, 0x6e, 0x27, 0x53, 0x1a, 0xc1, 0x88, 0x6a, 0x23, 0xf8, 0xb1, 0x21, 0x68, 0xb3, 0xfa, 0x18, 0x51, 0x8a, 0xc3, 0x62, 0x2b, 0xf0, 0xb9, 0x5b, 0x12, 0xc9, 0x80, 0x10, 0x59, 0x82, 0xcb, 0x29, 0x60, 0xbb, 0xf2, 0x86, 0xcf, 0x14, 0x5d, 0xbf, 0xf6, 0x2d, 0x64, 0xf4, 0xbd, 0x66, 0x2f, 0xcd, 0x84, 0x5f, 0x16, 0x73, 0x3a, 0xe1, 0xa8, 0x4a, 0x3, 0xd8, 0x91, 0x1, 0x48, 0x93, 0xda, 0x38, 0x71, 0xaa, 0xe3, 0x97, 0xde, 0x5, 0x4c, 0xae, 0xe7, 0x3c, 0x75, 0xe5, 0xac, 0x77, 0x3e, 0xdc, 0x95, 0x4e, 0x7, 0xa6, 0xef, 0x34, 0x7d, 0x9f, 0xd6, 0xd, 0x44, 0xd4, 0x9d, 0x46, 0xf, 0xed, 0xa4, 0x7f, 0x36, 0x42, 0xb, 0xd0, 0x99, 0x7b, 0x32, 0xe9, 0xa0, 0x30, 0x79, 0xa2, 0xeb, 0x9, 0x40, 0x9b, 0xd2, 0xc4, 0x8d, 0x56, 0x1f, 0xfd, 0xb4, 0x6f, 0x26, 0xb6, 0xff, 0x24, 0x6d, 0x8f, 0xc6, 0x1d, 0x54, 0x20, 0x69, 0xb2, 0xfb, 0x19, 0x50, 0x8b, 0xc2, 0x52, 0x1b, 0xc0, 0x89, 0x6b, 0x22, 0xf9, 0xb0, 0x11, 0x58, 0x83, 0xca, 0x28, 0x61, 0xba, 0xf3, 0x63, 0x2a, 0xf1, 0xb8, 0x5a, 0x13, 0xc8, 0x81, 0xf5, 0xbc, 0x67, 0x2e, 0xcc, 0x85, 0x5e, 0x17, 0x87, 0xce, 0x15, 0x5c, 0xbe, 0xf7, 0x2c, 0x65},
+ {0x0, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81, 0xd4, 0x9e, 0x40, 0xa, 0xe1, 0xab, 0x75, 0x3f, 0xbe, 0xf4, 0x2a, 0x60, 0x8b, 0xc1, 0x1f, 0x55, 0xb5, 0xff, 0x21, 0x6b, 0x80, 0xca, 0x14, 0x5e, 0xdf, 0x95, 0x4b, 0x1, 0xea, 0xa0, 0x7e, 0x34, 0x61, 0x2b, 0xf5, 0xbf, 0x54, 0x1e, 0xc0, 0x8a, 0xb, 0x41, 0x9f, 0xd5, 0x3e, 0x74, 0xaa, 0xe0, 0x77, 0x3d, 0xe3, 0xa9, 0x42, 0x8, 0xd6, 0x9c, 0x1d, 0x57, 0x89, 0xc3, 0x28, 0x62, 0xbc, 0xf6, 0xa3, 0xe9, 0x37, 0x7d, 0x96, 0xdc, 0x2, 0x48, 0xc9, 0x83, 0x5d, 0x17, 0xfc, 0xb6, 0x68, 0x22, 0xc2, 0x88, 0x56, 0x1c, 0xf7, 0xbd, 0x63, 0x29, 0xa8, 0xe2, 0x3c, 0x76, 0x9d, 0xd7, 0x9, 0x43, 0x16, 0x5c, 0x82, 0xc8, 0x23, 0x69, 0xb7, 0xfd, 0x7c, 0x36, 0xe8, 0xa2, 0x49, 0x3, 0xdd, 0x97, 0xee, 0xa4, 0x7a, 0x30, 0xdb, 0x91, 0x4f, 0x5, 0x84, 0xce, 0x10, 0x5a, 0xb1, 0xfb, 0x25, 0x6f, 0x3a, 0x70, 0xae, 0xe4, 0xf, 0x45, 0x9b, 0xd1, 0x50, 0x1a, 0xc4, 0x8e, 0x65, 0x2f, 0xf1, 0xbb, 0x5b, 0x11, 0xcf, 0x85, 0x6e, 0x24, 0xfa, 0xb0, 0x31, 0x7b, 0xa5, 0xef, 0x4, 0x4e, 0x90, 0xda, 0x8f, 0xc5, 0x1b, 0x51, 0xba, 0xf0, 0x2e, 0x64, 0xe5, 0xaf, 0x71, 0x3b, 0xd0, 0x9a, 0x44, 0xe, 0x99, 0xd3, 0xd, 0x47, 0xac, 0xe6, 0x38, 0x72, 0xf3, 0xb9, 0x67, 0x2d, 0xc6, 0x8c, 0x52, 0x18, 0x4d, 0x7, 0xd9, 0x93, 0x78, 0x32, 0xec, 0xa6, 0x27, 0x6d, 0xb3, 0xf9, 0x12, 0x58, 0x86, 0xcc, 0x2c, 0x66, 0xb8, 0xf2, 0x19, 0x53, 0x8d, 0xc7, 0x46, 0xc, 0xd2, 0x98, 0x73, 0x39, 0xe7, 0xad, 0xf8, 0xb2, 0x6c, 0x26, 0xcd, 0x87, 0x59, 0x13, 0x92, 0xd8, 0x6, 0x4c, 0xa7, 0xed, 0x33, 0x79},
+ {0x0, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e, 0xc4, 0x8f, 0x52, 0x19, 0xf5, 0xbe, 0x63, 0x28, 0xa6, 0xed, 0x30, 0x7b, 0x97, 0xdc, 0x1, 0x4a, 0x95, 0xde, 0x3, 0x48, 0xa4, 0xef, 0x32, 0x79, 0xf7, 0xbc, 0x61, 0x2a, 0xc6, 0x8d, 0x50, 0x1b, 0x51, 0x1a, 0xc7, 0x8c, 0x60, 0x2b, 0xf6, 0xbd, 0x33, 0x78, 0xa5, 0xee, 0x2, 0x49, 0x94, 0xdf, 0x37, 0x7c, 0xa1, 0xea, 0x6, 0x4d, 0x90, 0xdb, 0x55, 0x1e, 0xc3, 0x88, 0x64, 0x2f, 0xf2, 0xb9, 0xf3, 0xb8, 0x65, 0x2e, 0xc2, 0x89, 0x54, 0x1f, 0x91, 0xda, 0x7, 0x4c, 0xa0, 0xeb, 0x36, 0x7d, 0xa2, 0xe9, 0x34, 0x7f, 0x93, 0xd8, 0x5, 0x4e, 0xc0, 0x8b, 0x56, 0x1d, 0xf1, 0xba, 0x67, 0x2c, 0x66, 0x2d, 0xf0, 0xbb, 0x57, 0x1c, 0xc1, 0x8a, 0x4, 0x4f, 0x92, 0xd9, 0x35, 0x7e, 0xa3, 0xe8, 0x6e, 0x25, 0xf8, 0xb3, 0x5f, 0x14, 0xc9, 0x82, 0xc, 0x47, 0x9a, 0xd1, 0x3d, 0x76, 0xab, 0xe0, 0xaa, 0xe1, 0x3c, 0x77, 0x9b, 0xd0, 0xd, 0x46, 0xc8, 0x83, 0x5e, 0x15, 0xf9, 0xb2, 0x6f, 0x24, 0xfb, 0xb0, 0x6d, 0x26, 0xca, 0x81, 0x5c, 0x17, 0x99, 0xd2, 0xf, 0x44, 0xa8, 0xe3, 0x3e, 0x75, 0x3f, 0x74, 0xa9, 0xe2, 0xe, 0x45, 0x98, 0xd3, 0x5d, 0x16, 0xcb, 0x80, 0x6c, 0x27, 0xfa, 0xb1, 0x59, 0x12, 0xcf, 0x84, 0x68, 0x23, 0xfe, 0xb5, 0x3b, 0x70, 0xad, 0xe6, 0xa, 0x41, 0x9c, 0xd7, 0x9d, 0xd6, 0xb, 0x40, 0xac, 0xe7, 0x3a, 0x71, 0xff, 0xb4, 0x69, 0x22, 0xce, 0x85, 0x58, 0x13, 0xcc, 0x87, 0x5a, 0x11, 0xfd, 0xb6, 0x6b, 0x20, 0xae, 0xe5, 0x38, 0x73, 0x9f, 0xd4, 0x9, 0x42, 0x8, 0x43, 0x9e, 0xd5, 0x39, 0x72, 0xaf, 0xe4, 0x6a, 0x21, 0xfc, 0xb7, 0x5b, 0x10, 0xcd, 0x86},
+ {0x0, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3, 0xb4, 0xf8, 0x2c, 0x60, 0x99, 0xd5, 0x1, 0x4d, 0xee, 0xa2, 0x76, 0x3a, 0xc3, 0x8f, 0x5b, 0x17, 0x75, 0x39, 0xed, 0xa1, 0x58, 0x14, 0xc0, 0x8c, 0x2f, 0x63, 0xb7, 0xfb, 0x2, 0x4e, 0x9a, 0xd6, 0xc1, 0x8d, 0x59, 0x15, 0xec, 0xa0, 0x74, 0x38, 0x9b, 0xd7, 0x3, 0x4f, 0xb6, 0xfa, 0x2e, 0x62, 0xea, 0xa6, 0x72, 0x3e, 0xc7, 0x8b, 0x5f, 0x13, 0xb0, 0xfc, 0x28, 0x64, 0x9d, 0xd1, 0x5, 0x49, 0x5e, 0x12, 0xc6, 0x8a, 0x73, 0x3f, 0xeb, 0xa7, 0x4, 0x48, 0x9c, 0xd0, 0x29, 0x65, 0xb1, 0xfd, 0x9f, 0xd3, 0x7, 0x4b, 0xb2, 0xfe, 0x2a, 0x66, 0xc5, 0x89, 0x5d, 0x11, 0xe8, 0xa4, 0x70, 0x3c, 0x2b, 0x67, 0xb3, 0xff, 0x6, 0x4a, 0x9e, 0xd2, 0x71, 0x3d, 0xe9, 0xa5, 0x5c, 0x10, 0xc4, 0x88, 0xc9, 0x85, 0x51, 0x1d, 0xe4, 0xa8, 0x7c, 0x30, 0x93, 0xdf, 0xb, 0x47, 0xbe, 0xf2, 0x26, 0x6a, 0x7d, 0x31, 0xe5, 0xa9, 0x50, 0x1c, 0xc8, 0x84, 0x27, 0x6b, 0xbf, 0xf3, 0xa, 0x46, 0x92, 0xde, 0xbc, 0xf0, 0x24, 0x68, 0x91, 0xdd, 0x9, 0x45, 0xe6, 0xaa, 0x7e, 0x32, 0xcb, 0x87, 0x53, 0x1f, 0x8, 0x44, 0x90, 0xdc, 0x25, 0x69, 0xbd, 0xf1, 0x52, 0x1e, 0xca, 0x86, 0x7f, 0x33, 0xe7, 0xab, 0x23, 0x6f, 0xbb, 0xf7, 0xe, 0x42, 0x96, 0xda, 0x79, 0x35, 0xe1, 0xad, 0x54, 0x18, 0xcc, 0x80, 0x97, 0xdb, 0xf, 0x43, 0xba, 0xf6, 0x22, 0x6e, 0xcd, 0x81, 0x55, 0x19, 0xe0, 0xac, 0x78, 0x34, 0x56, 0x1a, 0xce, 0x82, 0x7b, 0x37, 0xe3, 0xaf, 0xc, 0x40, 0x94, 0xd8, 0x21, 0x6d, 0xb9, 0xf5, 0xe2, 0xae, 0x7a, 0x36, 0xcf, 0x83, 0x57, 0x1b, 0xb8, 0xf4, 0x20, 0x6c, 0x95, 0xd9, 0xd, 0x41},
+ {0x0, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac, 0xa4, 0xe9, 0x3e, 0x73, 0x8d, 0xc0, 0x17, 0x5a, 0xf6, 0xbb, 0x6c, 0x21, 0xdf, 0x92, 0x45, 0x8, 0x55, 0x18, 0xcf, 0x82, 0x7c, 0x31, 0xe6, 0xab, 0x7, 0x4a, 0x9d, 0xd0, 0x2e, 0x63, 0xb4, 0xf9, 0xf1, 0xbc, 0x6b, 0x26, 0xd8, 0x95, 0x42, 0xf, 0xa3, 0xee, 0x39, 0x74, 0x8a, 0xc7, 0x10, 0x5d, 0xaa, 0xe7, 0x30, 0x7d, 0x83, 0xce, 0x19, 0x54, 0xf8, 0xb5, 0x62, 0x2f, 0xd1, 0x9c, 0x4b, 0x6, 0xe, 0x43, 0x94, 0xd9, 0x27, 0x6a, 0xbd, 0xf0, 0x5c, 0x11, 0xc6, 0x8b, 0x75, 0x38, 0xef, 0xa2, 0xff, 0xb2, 0x65, 0x28, 0xd6, 0x9b, 0x4c, 0x1, 0xad, 0xe0, 0x37, 0x7a, 0x84, 0xc9, 0x1e, 0x53, 0x5b, 0x16, 0xc1, 0x8c, 0x72, 0x3f, 0xe8, 0xa5, 0x9, 0x44, 0x93, 0xde, 0x20, 0x6d, 0xba, 0xf7, 0x49, 0x4, 0xd3, 0x9e, 0x60, 0x2d, 0xfa, 0xb7, 0x1b, 0x56, 0x81, 0xcc, 0x32, 0x7f, 0xa8, 0xe5, 0xed, 0xa0, 0x77, 0x3a, 0xc4, 0x89, 0x5e, 0x13, 0xbf, 0xf2, 0x25, 0x68, 0x96, 0xdb, 0xc, 0x41, 0x1c, 0x51, 0x86, 0xcb, 0x35, 0x78, 0xaf, 0xe2, 0x4e, 0x3, 0xd4, 0x99, 0x67, 0x2a, 0xfd, 0xb0, 0xb8, 0xf5, 0x22, 0x6f, 0x91, 0xdc, 0xb, 0x46, 0xea, 0xa7, 0x70, 0x3d, 0xc3, 0x8e, 0x59, 0x14, 0xe3, 0xae, 0x79, 0x34, 0xca, 0x87, 0x50, 0x1d, 0xb1, 0xfc, 0x2b, 0x66, 0x98, 0xd5, 0x2, 0x4f, 0x47, 0xa, 0xdd, 0x90, 0x6e, 0x23, 0xf4, 0xb9, 0x15, 0x58, 0x8f, 0xc2, 0x3c, 0x71, 0xa6, 0xeb, 0xb6, 0xfb, 0x2c, 0x61, 0x9f, 0xd2, 0x5, 0x48, 0xe4, 0xa9, 0x7e, 0x33, 0xcd, 0x80, 0x57, 0x1a, 0x12, 0x5f, 0x88, 0xc5, 0x3b, 0x76, 0xa1, 0xec, 0x40, 0xd, 0xda, 0x97, 0x69, 0x24, 0xf3, 0xbe},
+ {0x0, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x4, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd, 0x94, 0xda, 0x8, 0x46, 0xb1, 0xff, 0x2d, 0x63, 0xde, 0x90, 0x42, 0xc, 0xfb, 0xb5, 0x67, 0x29, 0x35, 0x7b, 0xa9, 0xe7, 0x10, 0x5e, 0x8c, 0xc2, 0x7f, 0x31, 0xe3, 0xad, 0x5a, 0x14, 0xc6, 0x88, 0xa1, 0xef, 0x3d, 0x73, 0x84, 0xca, 0x18, 0x56, 0xeb, 0xa5, 0x77, 0x39, 0xce, 0x80, 0x52, 0x1c, 0x6a, 0x24, 0xf6, 0xb8, 0x4f, 0x1, 0xd3, 0x9d, 0x20, 0x6e, 0xbc, 0xf2, 0x5, 0x4b, 0x99, 0xd7, 0xfe, 0xb0, 0x62, 0x2c, 0xdb, 0x95, 0x47, 0x9, 0xb4, 0xfa, 0x28, 0x66, 0x91, 0xdf, 0xd, 0x43, 0x5f, 0x11, 0xc3, 0x8d, 0x7a, 0x34, 0xe6, 0xa8, 0x15, 0x5b, 0x89, 0xc7, 0x30, 0x7e, 0xac, 0xe2, 0xcb, 0x85, 0x57, 0x19, 0xee, 0xa0, 0x72, 0x3c, 0x81, 0xcf, 0x1d, 0x53, 0xa4, 0xea, 0x38, 0x76, 0xd4, 0x9a, 0x48, 0x6, 0xf1, 0xbf, 0x6d, 0x23, 0x9e, 0xd0, 0x2, 0x4c, 0xbb, 0xf5, 0x27, 0x69, 0x40, 0xe, 0xdc, 0x92, 0x65, 0x2b, 0xf9, 0xb7, 0xa, 0x44, 0x96, 0xd8, 0x2f, 0x61, 0xb3, 0xfd, 0xe1, 0xaf, 0x7d, 0x33, 0xc4, 0x8a, 0x58, 0x16, 0xab, 0xe5, 0x37, 0x79, 0x8e, 0xc0, 0x12, 0x5c, 0x75, 0x3b, 0xe9, 0xa7, 0x50, 0x1e, 0xcc, 0x82, 0x3f, 0x71, 0xa3, 0xed, 0x1a, 0x54, 0x86, 0xc8, 0xbe, 0xf0, 0x22, 0x6c, 0x9b, 0xd5, 0x7, 0x49, 0xf4, 0xba, 0x68, 0x26, 0xd1, 0x9f, 0x4d, 0x3, 0x2a, 0x64, 0xb6, 0xf8, 0xf, 0x41, 0x93, 0xdd, 0x60, 0x2e, 0xfc, 0xb2, 0x45, 0xb, 0xd9, 0x97, 0x8b, 0xc5, 0x17, 0x59, 0xae, 0xe0, 0x32, 0x7c, 0xc1, 0x8f, 0x5d, 0x13, 0xe4, 0xaa, 0x78, 0x36, 0x1f, 0x51, 0x83, 0xcd, 0x3a, 0x74, 0xa6, 0xe8, 0x55, 0x1b, 0xc9, 0x87, 0x70, 0x3e, 0xec, 0xa2},
+ {0x0, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0xd, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2, 0x84, 0xcb, 0x1a, 0x55, 0xa5, 0xea, 0x3b, 0x74, 0xc6, 0x89, 0x58, 0x17, 0xe7, 0xa8, 0x79, 0x36, 0x15, 0x5a, 0x8b, 0xc4, 0x34, 0x7b, 0xaa, 0xe5, 0x57, 0x18, 0xc9, 0x86, 0x76, 0x39, 0xe8, 0xa7, 0x91, 0xde, 0xf, 0x40, 0xb0, 0xff, 0x2e, 0x61, 0xd3, 0x9c, 0x4d, 0x2, 0xf2, 0xbd, 0x6c, 0x23, 0x2a, 0x65, 0xb4, 0xfb, 0xb, 0x44, 0x95, 0xda, 0x68, 0x27, 0xf6, 0xb9, 0x49, 0x6, 0xd7, 0x98, 0xae, 0xe1, 0x30, 0x7f, 0x8f, 0xc0, 0x11, 0x5e, 0xec, 0xa3, 0x72, 0x3d, 0xcd, 0x82, 0x53, 0x1c, 0x3f, 0x70, 0xa1, 0xee, 0x1e, 0x51, 0x80, 0xcf, 0x7d, 0x32, 0xe3, 0xac, 0x5c, 0x13, 0xc2, 0x8d, 0xbb, 0xf4, 0x25, 0x6a, 0x9a, 0xd5, 0x4, 0x4b, 0xf9, 0xb6, 0x67, 0x28, 0xd8, 0x97, 0x46, 0x9, 0x54, 0x1b, 0xca, 0x85, 0x75, 0x3a, 0xeb, 0xa4, 0x16, 0x59, 0x88, 0xc7, 0x37, 0x78, 0xa9, 0xe6, 0xd0, 0x9f, 0x4e, 0x1, 0xf1, 0xbe, 0x6f, 0x20, 0x92, 0xdd, 0xc, 0x43, 0xb3, 0xfc, 0x2d, 0x62, 0x41, 0xe, 0xdf, 0x90, 0x60, 0x2f, 0xfe, 0xb1, 0x3, 0x4c, 0x9d, 0xd2, 0x22, 0x6d, 0xbc, 0xf3, 0xc5, 0x8a, 0x5b, 0x14, 0xe4, 0xab, 0x7a, 0x35, 0x87, 0xc8, 0x19, 0x56, 0xa6, 0xe9, 0x38, 0x77, 0x7e, 0x31, 0xe0, 0xaf, 0x5f, 0x10, 0xc1, 0x8e, 0x3c, 0x73, 0xa2, 0xed, 0x1d, 0x52, 0x83, 0xcc, 0xfa, 0xb5, 0x64, 0x2b, 0xdb, 0x94, 0x45, 0xa, 0xb8, 0xf7, 0x26, 0x69, 0x99, 0xd6, 0x7, 0x48, 0x6b, 0x24, 0xf5, 0xba, 0x4a, 0x5, 0xd4, 0x9b, 0x29, 0x66, 0xb7, 0xf8, 0x8, 0x47, 0x96, 0xd9, 0xef, 0xa0, 0x71, 0x3e, 0xce, 0x81, 0x50, 0x1f, 0xad, 0xe2, 0x33, 0x7c, 0x8c, 0xc3, 0x12, 0x5d},
+ {0x0, 0x50, 0xa0, 0xf0, 0x5d, 0xd, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17, 0x69, 0x39, 0xc9, 0x99, 0x34, 0x64, 0x94, 0xc4, 0xd3, 0x83, 0x73, 0x23, 0x8e, 0xde, 0x2e, 0x7e, 0xd2, 0x82, 0x72, 0x22, 0x8f, 0xdf, 0x2f, 0x7f, 0x68, 0x38, 0xc8, 0x98, 0x35, 0x65, 0x95, 0xc5, 0xbb, 0xeb, 0x1b, 0x4b, 0xe6, 0xb6, 0x46, 0x16, 0x1, 0x51, 0xa1, 0xf1, 0x5c, 0xc, 0xfc, 0xac, 0xb9, 0xe9, 0x19, 0x49, 0xe4, 0xb4, 0x44, 0x14, 0x3, 0x53, 0xa3, 0xf3, 0x5e, 0xe, 0xfe, 0xae, 0xd0, 0x80, 0x70, 0x20, 0x8d, 0xdd, 0x2d, 0x7d, 0x6a, 0x3a, 0xca, 0x9a, 0x37, 0x67, 0x97, 0xc7, 0x6b, 0x3b, 0xcb, 0x9b, 0x36, 0x66, 0x96, 0xc6, 0xd1, 0x81, 0x71, 0x21, 0x8c, 0xdc, 0x2c, 0x7c, 0x2, 0x52, 0xa2, 0xf2, 0x5f, 0xf, 0xff, 0xaf, 0xb8, 0xe8, 0x18, 0x48, 0xe5, 0xb5, 0x45, 0x15, 0x6f, 0x3f, 0xcf, 0x9f, 0x32, 0x62, 0x92, 0xc2, 0xd5, 0x85, 0x75, 0x25, 0x88, 0xd8, 0x28, 0x78, 0x6, 0x56, 0xa6, 0xf6, 0x5b, 0xb, 0xfb, 0xab, 0xbc, 0xec, 0x1c, 0x4c, 0xe1, 0xb1, 0x41, 0x11, 0xbd, 0xed, 0x1d, 0x4d, 0xe0, 0xb0, 0x40, 0x10, 0x7, 0x57, 0xa7, 0xf7, 0x5a, 0xa, 0xfa, 0xaa, 0xd4, 0x84, 0x74, 0x24, 0x89, 0xd9, 0x29, 0x79, 0x6e, 0x3e, 0xce, 0x9e, 0x33, 0x63, 0x93, 0xc3, 0xd6, 0x86, 0x76, 0x26, 0x8b, 0xdb, 0x2b, 0x7b, 0x6c, 0x3c, 0xcc, 0x9c, 0x31, 0x61, 0x91, 0xc1, 0xbf, 0xef, 0x1f, 0x4f, 0xe2, 0xb2, 0x42, 0x12, 0x5, 0x55, 0xa5, 0xf5, 0x58, 0x8, 0xf8, 0xa8, 0x4, 0x54, 0xa4, 0xf4, 0x59, 0x9, 0xf9, 0xa9, 0xbe, 0xee, 0x1e, 0x4e, 0xe3, 0xb3, 0x43, 0x13, 0x6d, 0x3d, 0xcd, 0x9d, 0x30, 0x60, 0x90, 0xc0, 0xd7, 0x87, 0x77, 0x27, 0x8a, 0xda, 0x2a, 0x7a},
+ {0x0, 0x51, 0xa2, 0xf3, 0x59, 0x8, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18, 0x79, 0x28, 0xdb, 0x8a, 0x20, 0x71, 0x82, 0xd3, 0xcb, 0x9a, 0x69, 0x38, 0x92, 0xc3, 0x30, 0x61, 0xf2, 0xa3, 0x50, 0x1, 0xab, 0xfa, 0x9, 0x58, 0x40, 0x11, 0xe2, 0xb3, 0x19, 0x48, 0xbb, 0xea, 0x8b, 0xda, 0x29, 0x78, 0xd2, 0x83, 0x70, 0x21, 0x39, 0x68, 0x9b, 0xca, 0x60, 0x31, 0xc2, 0x93, 0xf9, 0xa8, 0x5b, 0xa, 0xa0, 0xf1, 0x2, 0x53, 0x4b, 0x1a, 0xe9, 0xb8, 0x12, 0x43, 0xb0, 0xe1, 0x80, 0xd1, 0x22, 0x73, 0xd9, 0x88, 0x7b, 0x2a, 0x32, 0x63, 0x90, 0xc1, 0x6b, 0x3a, 0xc9, 0x98, 0xb, 0x5a, 0xa9, 0xf8, 0x52, 0x3, 0xf0, 0xa1, 0xb9, 0xe8, 0x1b, 0x4a, 0xe0, 0xb1, 0x42, 0x13, 0x72, 0x23, 0xd0, 0x81, 0x2b, 0x7a, 0x89, 0xd8, 0xc0, 0x91, 0x62, 0x33, 0x99, 0xc8, 0x3b, 0x6a, 0xef, 0xbe, 0x4d, 0x1c, 0xb6, 0xe7, 0x14, 0x45, 0x5d, 0xc, 0xff, 0xae, 0x4, 0x55, 0xa6, 0xf7, 0x96, 0xc7, 0x34, 0x65, 0xcf, 0x9e, 0x6d, 0x3c, 0x24, 0x75, 0x86, 0xd7, 0x7d, 0x2c, 0xdf, 0x8e, 0x1d, 0x4c, 0xbf, 0xee, 0x44, 0x15, 0xe6, 0xb7, 0xaf, 0xfe, 0xd, 0x5c, 0xf6, 0xa7, 0x54, 0x5, 0x64, 0x35, 0xc6, 0x97, 0x3d, 0x6c, 0x9f, 0xce, 0xd6, 0x87, 0x74, 0x25, 0x8f, 0xde, 0x2d, 0x7c, 0x16, 0x47, 0xb4, 0xe5, 0x4f, 0x1e, 0xed, 0xbc, 0xa4, 0xf5, 0x6, 0x57, 0xfd, 0xac, 0x5f, 0xe, 0x6f, 0x3e, 0xcd, 0x9c, 0x36, 0x67, 0x94, 0xc5, 0xdd, 0x8c, 0x7f, 0x2e, 0x84, 0xd5, 0x26, 0x77, 0xe4, 0xb5, 0x46, 0x17, 0xbd, 0xec, 0x1f, 0x4e, 0x56, 0x7, 0xf4, 0xa5, 0xf, 0x5e, 0xad, 0xfc, 0x9d, 0xcc, 0x3f, 0x6e, 0xc4, 0x95, 0x66, 0x37, 0x2f, 0x7e, 0x8d, 0xdc, 0x76, 0x27, 0xd4, 0x85},
+ {0x0, 0x52, 0xa4, 0xf6, 0x55, 0x7, 0xf1, 0xa3, 0xaa, 0xf8, 0xe, 0x5c, 0xff, 0xad, 0x5b, 0x9, 0x49, 0x1b, 0xed, 0xbf, 0x1c, 0x4e, 0xb8, 0xea, 0xe3, 0xb1, 0x47, 0x15, 0xb6, 0xe4, 0x12, 0x40, 0x92, 0xc0, 0x36, 0x64, 0xc7, 0x95, 0x63, 0x31, 0x38, 0x6a, 0x9c, 0xce, 0x6d, 0x3f, 0xc9, 0x9b, 0xdb, 0x89, 0x7f, 0x2d, 0x8e, 0xdc, 0x2a, 0x78, 0x71, 0x23, 0xd5, 0x87, 0x24, 0x76, 0x80, 0xd2, 0x39, 0x6b, 0x9d, 0xcf, 0x6c, 0x3e, 0xc8, 0x9a, 0x93, 0xc1, 0x37, 0x65, 0xc6, 0x94, 0x62, 0x30, 0x70, 0x22, 0xd4, 0x86, 0x25, 0x77, 0x81, 0xd3, 0xda, 0x88, 0x7e, 0x2c, 0x8f, 0xdd, 0x2b, 0x79, 0xab, 0xf9, 0xf, 0x5d, 0xfe, 0xac, 0x5a, 0x8, 0x1, 0x53, 0xa5, 0xf7, 0x54, 0x6, 0xf0, 0xa2, 0xe2, 0xb0, 0x46, 0x14, 0xb7, 0xe5, 0x13, 0x41, 0x48, 0x1a, 0xec, 0xbe, 0x1d, 0x4f, 0xb9, 0xeb, 0x72, 0x20, 0xd6, 0x84, 0x27, 0x75, 0x83, 0xd1, 0xd8, 0x8a, 0x7c, 0x2e, 0x8d, 0xdf, 0x29, 0x7b, 0x3b, 0x69, 0x9f, 0xcd, 0x6e, 0x3c, 0xca, 0x98, 0x91, 0xc3, 0x35, 0x67, 0xc4, 0x96, 0x60, 0x32, 0xe0, 0xb2, 0x44, 0x16, 0xb5, 0xe7, 0x11, 0x43, 0x4a, 0x18, 0xee, 0xbc, 0x1f, 0x4d, 0xbb, 0xe9, 0xa9, 0xfb, 0xd, 0x5f, 0xfc, 0xae, 0x58, 0xa, 0x3, 0x51, 0xa7, 0xf5, 0x56, 0x4, 0xf2, 0xa0, 0x4b, 0x19, 0xef, 0xbd, 0x1e, 0x4c, 0xba, 0xe8, 0xe1, 0xb3, 0x45, 0x17, 0xb4, 0xe6, 0x10, 0x42, 0x2, 0x50, 0xa6, 0xf4, 0x57, 0x5, 0xf3, 0xa1, 0xa8, 0xfa, 0xc, 0x5e, 0xfd, 0xaf, 0x59, 0xb, 0xd9, 0x8b, 0x7d, 0x2f, 0x8c, 0xde, 0x28, 0x7a, 0x73, 0x21, 0xd7, 0x85, 0x26, 0x74, 0x82, 0xd0, 0x90, 0xc2, 0x34, 0x66, 0xc5, 0x97, 0x61, 0x33, 0x3a, 0x68, 0x9e, 0xcc, 0x6f, 0x3d, 0xcb, 0x99},
+ {0x0, 0x53, 0xa6, 0xf5, 0x51, 0x2, 0xf7, 0xa4, 0xa2, 0xf1, 0x4, 0x57, 0xf3, 0xa0, 0x55, 0x6, 0x59, 0xa, 0xff, 0xac, 0x8, 0x5b, 0xae, 0xfd, 0xfb, 0xa8, 0x5d, 0xe, 0xaa, 0xf9, 0xc, 0x5f, 0xb2, 0xe1, 0x14, 0x47, 0xe3, 0xb0, 0x45, 0x16, 0x10, 0x43, 0xb6, 0xe5, 0x41, 0x12, 0xe7, 0xb4, 0xeb, 0xb8, 0x4d, 0x1e, 0xba, 0xe9, 0x1c, 0x4f, 0x49, 0x1a, 0xef, 0xbc, 0x18, 0x4b, 0xbe, 0xed, 0x79, 0x2a, 0xdf, 0x8c, 0x28, 0x7b, 0x8e, 0xdd, 0xdb, 0x88, 0x7d, 0x2e, 0x8a, 0xd9, 0x2c, 0x7f, 0x20, 0x73, 0x86, 0xd5, 0x71, 0x22, 0xd7, 0x84, 0x82, 0xd1, 0x24, 0x77, 0xd3, 0x80, 0x75, 0x26, 0xcb, 0x98, 0x6d, 0x3e, 0x9a, 0xc9, 0x3c, 0x6f, 0x69, 0x3a, 0xcf, 0x9c, 0x38, 0x6b, 0x9e, 0xcd, 0x92, 0xc1, 0x34, 0x67, 0xc3, 0x90, 0x65, 0x36, 0x30, 0x63, 0x96, 0xc5, 0x61, 0x32, 0xc7, 0x94, 0xf2, 0xa1, 0x54, 0x7, 0xa3, 0xf0, 0x5, 0x56, 0x50, 0x3, 0xf6, 0xa5, 0x1, 0x52, 0xa7, 0xf4, 0xab, 0xf8, 0xd, 0x5e, 0xfa, 0xa9, 0x5c, 0xf, 0x9, 0x5a, 0xaf, 0xfc, 0x58, 0xb, 0xfe, 0xad, 0x40, 0x13, 0xe6, 0xb5, 0x11, 0x42, 0xb7, 0xe4, 0xe2, 0xb1, 0x44, 0x17, 0xb3, 0xe0, 0x15, 0x46, 0x19, 0x4a, 0xbf, 0xec, 0x48, 0x1b, 0xee, 0xbd, 0xbb, 0xe8, 0x1d, 0x4e, 0xea, 0xb9, 0x4c, 0x1f, 0x8b, 0xd8, 0x2d, 0x7e, 0xda, 0x89, 0x7c, 0x2f, 0x29, 0x7a, 0x8f, 0xdc, 0x78, 0x2b, 0xde, 0x8d, 0xd2, 0x81, 0x74, 0x27, 0x83, 0xd0, 0x25, 0x76, 0x70, 0x23, 0xd6, 0x85, 0x21, 0x72, 0x87, 0xd4, 0x39, 0x6a, 0x9f, 0xcc, 0x68, 0x3b, 0xce, 0x9d, 0x9b, 0xc8, 0x3d, 0x6e, 0xca, 0x99, 0x6c, 0x3f, 0x60, 0x33, 0xc6, 0x95, 0x31, 0x62, 0x97, 0xc4, 0xc2, 0x91, 0x64, 0x37, 0x93, 0xc0, 0x35, 0x66},
+ {0x0, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b, 0x29, 0x7d, 0x81, 0xd5, 0x64, 0x30, 0xcc, 0x98, 0xb3, 0xe7, 0x1b, 0x4f, 0xfe, 0xaa, 0x56, 0x2, 0x52, 0x6, 0xfa, 0xae, 0x1f, 0x4b, 0xb7, 0xe3, 0xc8, 0x9c, 0x60, 0x34, 0x85, 0xd1, 0x2d, 0x79, 0x7b, 0x2f, 0xd3, 0x87, 0x36, 0x62, 0x9e, 0xca, 0xe1, 0xb5, 0x49, 0x1d, 0xac, 0xf8, 0x4, 0x50, 0xa4, 0xf0, 0xc, 0x58, 0xe9, 0xbd, 0x41, 0x15, 0x3e, 0x6a, 0x96, 0xc2, 0x73, 0x27, 0xdb, 0x8f, 0x8d, 0xd9, 0x25, 0x71, 0xc0, 0x94, 0x68, 0x3c, 0x17, 0x43, 0xbf, 0xeb, 0x5a, 0xe, 0xf2, 0xa6, 0xf6, 0xa2, 0x5e, 0xa, 0xbb, 0xef, 0x13, 0x47, 0x6c, 0x38, 0xc4, 0x90, 0x21, 0x75, 0x89, 0xdd, 0xdf, 0x8b, 0x77, 0x23, 0x92, 0xc6, 0x3a, 0x6e, 0x45, 0x11, 0xed, 0xb9, 0x8, 0x5c, 0xa0, 0xf4, 0x55, 0x1, 0xfd, 0xa9, 0x18, 0x4c, 0xb0, 0xe4, 0xcf, 0x9b, 0x67, 0x33, 0x82, 0xd6, 0x2a, 0x7e, 0x7c, 0x28, 0xd4, 0x80, 0x31, 0x65, 0x99, 0xcd, 0xe6, 0xb2, 0x4e, 0x1a, 0xab, 0xff, 0x3, 0x57, 0x7, 0x53, 0xaf, 0xfb, 0x4a, 0x1e, 0xe2, 0xb6, 0x9d, 0xc9, 0x35, 0x61, 0xd0, 0x84, 0x78, 0x2c, 0x2e, 0x7a, 0x86, 0xd2, 0x63, 0x37, 0xcb, 0x9f, 0xb4, 0xe0, 0x1c, 0x48, 0xf9, 0xad, 0x51, 0x5, 0xf1, 0xa5, 0x59, 0xd, 0xbc, 0xe8, 0x14, 0x40, 0x6b, 0x3f, 0xc3, 0x97, 0x26, 0x72, 0x8e, 0xda, 0xd8, 0x8c, 0x70, 0x24, 0x95, 0xc1, 0x3d, 0x69, 0x42, 0x16, 0xea, 0xbe, 0xf, 0x5b, 0xa7, 0xf3, 0xa3, 0xf7, 0xb, 0x5f, 0xee, 0xba, 0x46, 0x12, 0x39, 0x6d, 0x91, 0xc5, 0x74, 0x20, 0xdc, 0x88, 0x8a, 0xde, 0x22, 0x76, 0xc7, 0x93, 0x6f, 0x3b, 0x10, 0x44, 0xb8, 0xec, 0x5d, 0x9, 0xf5, 0xa1},
+ {0x0, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24, 0x39, 0x6c, 0x93, 0xc6, 0x70, 0x25, 0xda, 0x8f, 0xab, 0xfe, 0x1, 0x54, 0xe2, 0xb7, 0x48, 0x1d, 0x72, 0x27, 0xd8, 0x8d, 0x3b, 0x6e, 0x91, 0xc4, 0xe0, 0xb5, 0x4a, 0x1f, 0xa9, 0xfc, 0x3, 0x56, 0x4b, 0x1e, 0xe1, 0xb4, 0x2, 0x57, 0xa8, 0xfd, 0xd9, 0x8c, 0x73, 0x26, 0x90, 0xc5, 0x3a, 0x6f, 0xe4, 0xb1, 0x4e, 0x1b, 0xad, 0xf8, 0x7, 0x52, 0x76, 0x23, 0xdc, 0x89, 0x3f, 0x6a, 0x95, 0xc0, 0xdd, 0x88, 0x77, 0x22, 0x94, 0xc1, 0x3e, 0x6b, 0x4f, 0x1a, 0xe5, 0xb0, 0x6, 0x53, 0xac, 0xf9, 0x96, 0xc3, 0x3c, 0x69, 0xdf, 0x8a, 0x75, 0x20, 0x4, 0x51, 0xae, 0xfb, 0x4d, 0x18, 0xe7, 0xb2, 0xaf, 0xfa, 0x5, 0x50, 0xe6, 0xb3, 0x4c, 0x19, 0x3d, 0x68, 0x97, 0xc2, 0x74, 0x21, 0xde, 0x8b, 0xd5, 0x80, 0x7f, 0x2a, 0x9c, 0xc9, 0x36, 0x63, 0x47, 0x12, 0xed, 0xb8, 0xe, 0x5b, 0xa4, 0xf1, 0xec, 0xb9, 0x46, 0x13, 0xa5, 0xf0, 0xf, 0x5a, 0x7e, 0x2b, 0xd4, 0x81, 0x37, 0x62, 0x9d, 0xc8, 0xa7, 0xf2, 0xd, 0x58, 0xee, 0xbb, 0x44, 0x11, 0x35, 0x60, 0x9f, 0xca, 0x7c, 0x29, 0xd6, 0x83, 0x9e, 0xcb, 0x34, 0x61, 0xd7, 0x82, 0x7d, 0x28, 0xc, 0x59, 0xa6, 0xf3, 0x45, 0x10, 0xef, 0xba, 0x31, 0x64, 0x9b, 0xce, 0x78, 0x2d, 0xd2, 0x87, 0xa3, 0xf6, 0x9, 0x5c, 0xea, 0xbf, 0x40, 0x15, 0x8, 0x5d, 0xa2, 0xf7, 0x41, 0x14, 0xeb, 0xbe, 0x9a, 0xcf, 0x30, 0x65, 0xd3, 0x86, 0x79, 0x2c, 0x43, 0x16, 0xe9, 0xbc, 0xa, 0x5f, 0xa0, 0xf5, 0xd1, 0x84, 0x7b, 0x2e, 0x98, 0xcd, 0x32, 0x67, 0x7a, 0x2f, 0xd0, 0x85, 0x33, 0x66, 0x99, 0xcc, 0xe8, 0xbd, 0x42, 0x17, 0xa1, 0xf4, 0xb, 0x5e},
+ {0x0, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35, 0x9, 0x5f, 0xa5, 0xf3, 0x4c, 0x1a, 0xe0, 0xb6, 0x83, 0xd5, 0x2f, 0x79, 0xc6, 0x90, 0x6a, 0x3c, 0x12, 0x44, 0xbe, 0xe8, 0x57, 0x1, 0xfb, 0xad, 0x98, 0xce, 0x34, 0x62, 0xdd, 0x8b, 0x71, 0x27, 0x1b, 0x4d, 0xb7, 0xe1, 0x5e, 0x8, 0xf2, 0xa4, 0x91, 0xc7, 0x3d, 0x6b, 0xd4, 0x82, 0x78, 0x2e, 0x24, 0x72, 0x88, 0xde, 0x61, 0x37, 0xcd, 0x9b, 0xae, 0xf8, 0x2, 0x54, 0xeb, 0xbd, 0x47, 0x11, 0x2d, 0x7b, 0x81, 0xd7, 0x68, 0x3e, 0xc4, 0x92, 0xa7, 0xf1, 0xb, 0x5d, 0xe2, 0xb4, 0x4e, 0x18, 0x36, 0x60, 0x9a, 0xcc, 0x73, 0x25, 0xdf, 0x89, 0xbc, 0xea, 0x10, 0x46, 0xf9, 0xaf, 0x55, 0x3, 0x3f, 0x69, 0x93, 0xc5, 0x7a, 0x2c, 0xd6, 0x80, 0xb5, 0xe3, 0x19, 0x4f, 0xf0, 0xa6, 0x5c, 0xa, 0x48, 0x1e, 0xe4, 0xb2, 0xd, 0x5b, 0xa1, 0xf7, 0xc2, 0x94, 0x6e, 0x38, 0x87, 0xd1, 0x2b, 0x7d, 0x41, 0x17, 0xed, 0xbb, 0x4, 0x52, 0xa8, 0xfe, 0xcb, 0x9d, 0x67, 0x31, 0x8e, 0xd8, 0x22, 0x74, 0x5a, 0xc, 0xf6, 0xa0, 0x1f, 0x49, 0xb3, 0xe5, 0xd0, 0x86, 0x7c, 0x2a, 0x95, 0xc3, 0x39, 0x6f, 0x53, 0x5, 0xff, 0xa9, 0x16, 0x40, 0xba, 0xec, 0xd9, 0x8f, 0x75, 0x23, 0x9c, 0xca, 0x30, 0x66, 0x6c, 0x3a, 0xc0, 0x96, 0x29, 0x7f, 0x85, 0xd3, 0xe6, 0xb0, 0x4a, 0x1c, 0xa3, 0xf5, 0xf, 0x59, 0x65, 0x33, 0xc9, 0x9f, 0x20, 0x76, 0x8c, 0xda, 0xef, 0xb9, 0x43, 0x15, 0xaa, 0xfc, 0x6, 0x50, 0x7e, 0x28, 0xd2, 0x84, 0x3b, 0x6d, 0x97, 0xc1, 0xf4, 0xa2, 0x58, 0xe, 0xb1, 0xe7, 0x1d, 0x4b, 0x77, 0x21, 0xdb, 0x8d, 0x32, 0x64, 0x9e, 0xc8, 0xfd, 0xab, 0x51, 0x7, 0xb8, 0xee, 0x14, 0x42},
+ {0x0, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a, 0x19, 0x4e, 0xb7, 0xe0, 0x58, 0xf, 0xf6, 0xa1, 0x9b, 0xcc, 0x35, 0x62, 0xda, 0x8d, 0x74, 0x23, 0x32, 0x65, 0x9c, 0xcb, 0x73, 0x24, 0xdd, 0x8a, 0xb0, 0xe7, 0x1e, 0x49, 0xf1, 0xa6, 0x5f, 0x8, 0x2b, 0x7c, 0x85, 0xd2, 0x6a, 0x3d, 0xc4, 0x93, 0xa9, 0xfe, 0x7, 0x50, 0xe8, 0xbf, 0x46, 0x11, 0x64, 0x33, 0xca, 0x9d, 0x25, 0x72, 0x8b, 0xdc, 0xe6, 0xb1, 0x48, 0x1f, 0xa7, 0xf0, 0x9, 0x5e, 0x7d, 0x2a, 0xd3, 0x84, 0x3c, 0x6b, 0x92, 0xc5, 0xff, 0xa8, 0x51, 0x6, 0xbe, 0xe9, 0x10, 0x47, 0x56, 0x1, 0xf8, 0xaf, 0x17, 0x40, 0xb9, 0xee, 0xd4, 0x83, 0x7a, 0x2d, 0x95, 0xc2, 0x3b, 0x6c, 0x4f, 0x18, 0xe1, 0xb6, 0xe, 0x59, 0xa0, 0xf7, 0xcd, 0x9a, 0x63, 0x34, 0x8c, 0xdb, 0x22, 0x75, 0xc8, 0x9f, 0x66, 0x31, 0x89, 0xde, 0x27, 0x70, 0x4a, 0x1d, 0xe4, 0xb3, 0xb, 0x5c, 0xa5, 0xf2, 0xd1, 0x86, 0x7f, 0x28, 0x90, 0xc7, 0x3e, 0x69, 0x53, 0x4, 0xfd, 0xaa, 0x12, 0x45, 0xbc, 0xeb, 0xfa, 0xad, 0x54, 0x3, 0xbb, 0xec, 0x15, 0x42, 0x78, 0x2f, 0xd6, 0x81, 0x39, 0x6e, 0x97, 0xc0, 0xe3, 0xb4, 0x4d, 0x1a, 0xa2, 0xf5, 0xc, 0x5b, 0x61, 0x36, 0xcf, 0x98, 0x20, 0x77, 0x8e, 0xd9, 0xac, 0xfb, 0x2, 0x55, 0xed, 0xba, 0x43, 0x14, 0x2e, 0x79, 0x80, 0xd7, 0x6f, 0x38, 0xc1, 0x96, 0xb5, 0xe2, 0x1b, 0x4c, 0xf4, 0xa3, 0x5a, 0xd, 0x37, 0x60, 0x99, 0xce, 0x76, 0x21, 0xd8, 0x8f, 0x9e, 0xc9, 0x30, 0x67, 0xdf, 0x88, 0x71, 0x26, 0x1c, 0x4b, 0xb2, 0xe5, 0x5d, 0xa, 0xf3, 0xa4, 0x87, 0xd0, 0x29, 0x7e, 0xc6, 0x91, 0x68, 0x3f, 0x5, 0x52, 0xab, 0xfc, 0x44, 0x13, 0xea, 0xbd},
+ {0x0, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f, 0xe9, 0xb1, 0x59, 0x1, 0x94, 0xcc, 0x24, 0x7c, 0x13, 0x4b, 0xa3, 0xfb, 0x6e, 0x36, 0xde, 0x86, 0xcf, 0x97, 0x7f, 0x27, 0xb2, 0xea, 0x2, 0x5a, 0x35, 0x6d, 0x85, 0xdd, 0x48, 0x10, 0xf8, 0xa0, 0x26, 0x7e, 0x96, 0xce, 0x5b, 0x3, 0xeb, 0xb3, 0xdc, 0x84, 0x6c, 0x34, 0xa1, 0xf9, 0x11, 0x49, 0x83, 0xdb, 0x33, 0x6b, 0xfe, 0xa6, 0x4e, 0x16, 0x79, 0x21, 0xc9, 0x91, 0x4, 0x5c, 0xb4, 0xec, 0x6a, 0x32, 0xda, 0x82, 0x17, 0x4f, 0xa7, 0xff, 0x90, 0xc8, 0x20, 0x78, 0xed, 0xb5, 0x5d, 0x5, 0x4c, 0x14, 0xfc, 0xa4, 0x31, 0x69, 0x81, 0xd9, 0xb6, 0xee, 0x6, 0x5e, 0xcb, 0x93, 0x7b, 0x23, 0xa5, 0xfd, 0x15, 0x4d, 0xd8, 0x80, 0x68, 0x30, 0x5f, 0x7, 0xef, 0xb7, 0x22, 0x7a, 0x92, 0xca, 0x1b, 0x43, 0xab, 0xf3, 0x66, 0x3e, 0xd6, 0x8e, 0xe1, 0xb9, 0x51, 0x9, 0x9c, 0xc4, 0x2c, 0x74, 0xf2, 0xaa, 0x42, 0x1a, 0x8f, 0xd7, 0x3f, 0x67, 0x8, 0x50, 0xb8, 0xe0, 0x75, 0x2d, 0xc5, 0x9d, 0xd4, 0x8c, 0x64, 0x3c, 0xa9, 0xf1, 0x19, 0x41, 0x2e, 0x76, 0x9e, 0xc6, 0x53, 0xb, 0xe3, 0xbb, 0x3d, 0x65, 0x8d, 0xd5, 0x40, 0x18, 0xf0, 0xa8, 0xc7, 0x9f, 0x77, 0x2f, 0xba, 0xe2, 0xa, 0x52, 0x98, 0xc0, 0x28, 0x70, 0xe5, 0xbd, 0x55, 0xd, 0x62, 0x3a, 0xd2, 0x8a, 0x1f, 0x47, 0xaf, 0xf7, 0x71, 0x29, 0xc1, 0x99, 0xc, 0x54, 0xbc, 0xe4, 0x8b, 0xd3, 0x3b, 0x63, 0xf6, 0xae, 0x46, 0x1e, 0x57, 0xf, 0xe7, 0xbf, 0x2a, 0x72, 0x9a, 0xc2, 0xad, 0xf5, 0x1d, 0x45, 0xd0, 0x88, 0x60, 0x38, 0xbe, 0xe6, 0xe, 0x56, 0xc3, 0x9b, 0x73, 0x2b, 0x44, 0x1c, 0xf4, 0xac, 0x39, 0x61, 0x89, 0xd1},
+ {0x0, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60, 0xf9, 0xa0, 0x4b, 0x12, 0x80, 0xd9, 0x32, 0x6b, 0xb, 0x52, 0xb9, 0xe0, 0x72, 0x2b, 0xc0, 0x99, 0xef, 0xb6, 0x5d, 0x4, 0x96, 0xcf, 0x24, 0x7d, 0x1d, 0x44, 0xaf, 0xf6, 0x64, 0x3d, 0xd6, 0x8f, 0x16, 0x4f, 0xa4, 0xfd, 0x6f, 0x36, 0xdd, 0x84, 0xe4, 0xbd, 0x56, 0xf, 0x9d, 0xc4, 0x2f, 0x76, 0xc3, 0x9a, 0x71, 0x28, 0xba, 0xe3, 0x8, 0x51, 0x31, 0x68, 0x83, 0xda, 0x48, 0x11, 0xfa, 0xa3, 0x3a, 0x63, 0x88, 0xd1, 0x43, 0x1a, 0xf1, 0xa8, 0xc8, 0x91, 0x7a, 0x23, 0xb1, 0xe8, 0x3, 0x5a, 0x2c, 0x75, 0x9e, 0xc7, 0x55, 0xc, 0xe7, 0xbe, 0xde, 0x87, 0x6c, 0x35, 0xa7, 0xfe, 0x15, 0x4c, 0xd5, 0x8c, 0x67, 0x3e, 0xac, 0xf5, 0x1e, 0x47, 0x27, 0x7e, 0x95, 0xcc, 0x5e, 0x7, 0xec, 0xb5, 0x9b, 0xc2, 0x29, 0x70, 0xe2, 0xbb, 0x50, 0x9, 0x69, 0x30, 0xdb, 0x82, 0x10, 0x49, 0xa2, 0xfb, 0x62, 0x3b, 0xd0, 0x89, 0x1b, 0x42, 0xa9, 0xf0, 0x90, 0xc9, 0x22, 0x7b, 0xe9, 0xb0, 0x5b, 0x2, 0x74, 0x2d, 0xc6, 0x9f, 0xd, 0x54, 0xbf, 0xe6, 0x86, 0xdf, 0x34, 0x6d, 0xff, 0xa6, 0x4d, 0x14, 0x8d, 0xd4, 0x3f, 0x66, 0xf4, 0xad, 0x46, 0x1f, 0x7f, 0x26, 0xcd, 0x94, 0x6, 0x5f, 0xb4, 0xed, 0x58, 0x1, 0xea, 0xb3, 0x21, 0x78, 0x93, 0xca, 0xaa, 0xf3, 0x18, 0x41, 0xd3, 0x8a, 0x61, 0x38, 0xa1, 0xf8, 0x13, 0x4a, 0xd8, 0x81, 0x6a, 0x33, 0x53, 0xa, 0xe1, 0xb8, 0x2a, 0x73, 0x98, 0xc1, 0xb7, 0xee, 0x5, 0x5c, 0xce, 0x97, 0x7c, 0x25, 0x45, 0x1c, 0xf7, 0xae, 0x3c, 0x65, 0x8e, 0xd7, 0x4e, 0x17, 0xfc, 0xa5, 0x37, 0x6e, 0x85, 0xdc, 0xbc, 0xe5, 0xe, 0x57, 0xc5, 0x9c, 0x77, 0x2e},
+ {0x0, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x4, 0x9f, 0xc5, 0x2b, 0x71, 0xc9, 0x93, 0x7d, 0x27, 0xbc, 0xe6, 0x8, 0x52, 0x23, 0x79, 0x97, 0xcd, 0x56, 0xc, 0xe2, 0xb8, 0x8f, 0xd5, 0x3b, 0x61, 0xfa, 0xa0, 0x4e, 0x14, 0x65, 0x3f, 0xd1, 0x8b, 0x10, 0x4a, 0xa4, 0xfe, 0x46, 0x1c, 0xf2, 0xa8, 0x33, 0x69, 0x87, 0xdd, 0xac, 0xf6, 0x18, 0x42, 0xd9, 0x83, 0x6d, 0x37, 0x3, 0x59, 0xb7, 0xed, 0x76, 0x2c, 0xc2, 0x98, 0xe9, 0xb3, 0x5d, 0x7, 0x9c, 0xc6, 0x28, 0x72, 0xca, 0x90, 0x7e, 0x24, 0xbf, 0xe5, 0xb, 0x51, 0x20, 0x7a, 0x94, 0xce, 0x55, 0xf, 0xe1, 0xbb, 0x8c, 0xd6, 0x38, 0x62, 0xf9, 0xa3, 0x4d, 0x17, 0x66, 0x3c, 0xd2, 0x88, 0x13, 0x49, 0xa7, 0xfd, 0x45, 0x1f, 0xf1, 0xab, 0x30, 0x6a, 0x84, 0xde, 0xaf, 0xf5, 0x1b, 0x41, 0xda, 0x80, 0x6e, 0x34, 0x6, 0x5c, 0xb2, 0xe8, 0x73, 0x29, 0xc7, 0x9d, 0xec, 0xb6, 0x58, 0x2, 0x99, 0xc3, 0x2d, 0x77, 0xcf, 0x95, 0x7b, 0x21, 0xba, 0xe0, 0xe, 0x54, 0x25, 0x7f, 0x91, 0xcb, 0x50, 0xa, 0xe4, 0xbe, 0x89, 0xd3, 0x3d, 0x67, 0xfc, 0xa6, 0x48, 0x12, 0x63, 0x39, 0xd7, 0x8d, 0x16, 0x4c, 0xa2, 0xf8, 0x40, 0x1a, 0xf4, 0xae, 0x35, 0x6f, 0x81, 0xdb, 0xaa, 0xf0, 0x1e, 0x44, 0xdf, 0x85, 0x6b, 0x31, 0x5, 0x5f, 0xb1, 0xeb, 0x70, 0x2a, 0xc4, 0x9e, 0xef, 0xb5, 0x5b, 0x1, 0x9a, 0xc0, 0x2e, 0x74, 0xcc, 0x96, 0x78, 0x22, 0xb9, 0xe3, 0xd, 0x57, 0x26, 0x7c, 0x92, 0xc8, 0x53, 0x9, 0xe7, 0xbd, 0x8a, 0xd0, 0x3e, 0x64, 0xff, 0xa5, 0x4b, 0x11, 0x60, 0x3a, 0xd4, 0x8e, 0x15, 0x4f, 0xa1, 0xfb, 0x43, 0x19, 0xf7, 0xad, 0x36, 0x6c, 0x82, 0xd8, 0xa9, 0xf3, 0x1d, 0x47, 0xdc, 0x86, 0x68, 0x32},
+ {0x0, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0xf, 0x93, 0xc8, 0x25, 0x7e, 0xd9, 0x82, 0x6f, 0x34, 0xa8, 0xf3, 0x1e, 0x45, 0x3b, 0x60, 0x8d, 0xd6, 0x4a, 0x11, 0xfc, 0xa7, 0xaf, 0xf4, 0x19, 0x42, 0xde, 0x85, 0x68, 0x33, 0x4d, 0x16, 0xfb, 0xa0, 0x3c, 0x67, 0x8a, 0xd1, 0x76, 0x2d, 0xc0, 0x9b, 0x7, 0x5c, 0xb1, 0xea, 0x94, 0xcf, 0x22, 0x79, 0xe5, 0xbe, 0x53, 0x8, 0x43, 0x18, 0xf5, 0xae, 0x32, 0x69, 0x84, 0xdf, 0xa1, 0xfa, 0x17, 0x4c, 0xd0, 0x8b, 0x66, 0x3d, 0x9a, 0xc1, 0x2c, 0x77, 0xeb, 0xb0, 0x5d, 0x6, 0x78, 0x23, 0xce, 0x95, 0x9, 0x52, 0xbf, 0xe4, 0xec, 0xb7, 0x5a, 0x1, 0x9d, 0xc6, 0x2b, 0x70, 0xe, 0x55, 0xb8, 0xe3, 0x7f, 0x24, 0xc9, 0x92, 0x35, 0x6e, 0x83, 0xd8, 0x44, 0x1f, 0xf2, 0xa9, 0xd7, 0x8c, 0x61, 0x3a, 0xa6, 0xfd, 0x10, 0x4b, 0x86, 0xdd, 0x30, 0x6b, 0xf7, 0xac, 0x41, 0x1a, 0x64, 0x3f, 0xd2, 0x89, 0x15, 0x4e, 0xa3, 0xf8, 0x5f, 0x4, 0xe9, 0xb2, 0x2e, 0x75, 0x98, 0xc3, 0xbd, 0xe6, 0xb, 0x50, 0xcc, 0x97, 0x7a, 0x21, 0x29, 0x72, 0x9f, 0xc4, 0x58, 0x3, 0xee, 0xb5, 0xcb, 0x90, 0x7d, 0x26, 0xba, 0xe1, 0xc, 0x57, 0xf0, 0xab, 0x46, 0x1d, 0x81, 0xda, 0x37, 0x6c, 0x12, 0x49, 0xa4, 0xff, 0x63, 0x38, 0xd5, 0x8e, 0xc5, 0x9e, 0x73, 0x28, 0xb4, 0xef, 0x2, 0x59, 0x27, 0x7c, 0x91, 0xca, 0x56, 0xd, 0xe0, 0xbb, 0x1c, 0x47, 0xaa, 0xf1, 0x6d, 0x36, 0xdb, 0x80, 0xfe, 0xa5, 0x48, 0x13, 0x8f, 0xd4, 0x39, 0x62, 0x6a, 0x31, 0xdc, 0x87, 0x1b, 0x40, 0xad, 0xf6, 0x88, 0xd3, 0x3e, 0x65, 0xf9, 0xa2, 0x4f, 0x14, 0xb3, 0xe8, 0x5, 0x5e, 0xc2, 0x99, 0x74, 0x2f, 0x51, 0xa, 0xe7, 0xbc, 0x20, 0x7b, 0x96, 0xcd},
+ {0x0, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0xf, 0x53, 0xa9, 0xf5, 0x11, 0x4d, 0xc4, 0x98, 0x7c, 0x20, 0x73, 0x2f, 0xcb, 0x97, 0x1e, 0x42, 0xa6, 0xfa, 0x4f, 0x13, 0xf7, 0xab, 0x22, 0x7e, 0x9a, 0xc6, 0x95, 0xc9, 0x2d, 0x71, 0xf8, 0xa4, 0x40, 0x1c, 0xe6, 0xba, 0x5e, 0x2, 0x8b, 0xd7, 0x33, 0x6f, 0x3c, 0x60, 0x84, 0xd8, 0x51, 0xd, 0xe9, 0xb5, 0x9e, 0xc2, 0x26, 0x7a, 0xf3, 0xaf, 0x4b, 0x17, 0x44, 0x18, 0xfc, 0xa0, 0x29, 0x75, 0x91, 0xcd, 0x37, 0x6b, 0x8f, 0xd3, 0x5a, 0x6, 0xe2, 0xbe, 0xed, 0xb1, 0x55, 0x9, 0x80, 0xdc, 0x38, 0x64, 0xd1, 0x8d, 0x69, 0x35, 0xbc, 0xe0, 0x4, 0x58, 0xb, 0x57, 0xb3, 0xef, 0x66, 0x3a, 0xde, 0x82, 0x78, 0x24, 0xc0, 0x9c, 0x15, 0x49, 0xad, 0xf1, 0xa2, 0xfe, 0x1a, 0x46, 0xcf, 0x93, 0x77, 0x2b, 0x21, 0x7d, 0x99, 0xc5, 0x4c, 0x10, 0xf4, 0xa8, 0xfb, 0xa7, 0x43, 0x1f, 0x96, 0xca, 0x2e, 0x72, 0x88, 0xd4, 0x30, 0x6c, 0xe5, 0xb9, 0x5d, 0x1, 0x52, 0xe, 0xea, 0xb6, 0x3f, 0x63, 0x87, 0xdb, 0x6e, 0x32, 0xd6, 0x8a, 0x3, 0x5f, 0xbb, 0xe7, 0xb4, 0xe8, 0xc, 0x50, 0xd9, 0x85, 0x61, 0x3d, 0xc7, 0x9b, 0x7f, 0x23, 0xaa, 0xf6, 0x12, 0x4e, 0x1d, 0x41, 0xa5, 0xf9, 0x70, 0x2c, 0xc8, 0x94, 0xbf, 0xe3, 0x7, 0x5b, 0xd2, 0x8e, 0x6a, 0x36, 0x65, 0x39, 0xdd, 0x81, 0x8, 0x54, 0xb0, 0xec, 0x16, 0x4a, 0xae, 0xf2, 0x7b, 0x27, 0xc3, 0x9f, 0xcc, 0x90, 0x74, 0x28, 0xa1, 0xfd, 0x19, 0x45, 0xf0, 0xac, 0x48, 0x14, 0x9d, 0xc1, 0x25, 0x79, 0x2a, 0x76, 0x92, 0xce, 0x47, 0x1b, 0xff, 0xa3, 0x59, 0x5, 0xe1, 0xbd, 0x34, 0x68, 0x8c, 0xd0, 0x83, 0xdf, 0x3b, 0x67, 0xee, 0xb2, 0x56, 0xa},
+ {0x0, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x1, 0x5c, 0xb9, 0xe4, 0x3, 0x5e, 0xd0, 0x8d, 0x6a, 0x37, 0x6b, 0x36, 0xd1, 0x8c, 0x2, 0x5f, 0xb8, 0xe5, 0x6f, 0x32, 0xd5, 0x88, 0x6, 0x5b, 0xbc, 0xe1, 0xbd, 0xe0, 0x7, 0x5a, 0xd4, 0x89, 0x6e, 0x33, 0xd6, 0x8b, 0x6c, 0x31, 0xbf, 0xe2, 0x5, 0x58, 0x4, 0x59, 0xbe, 0xe3, 0x6d, 0x30, 0xd7, 0x8a, 0xde, 0x83, 0x64, 0x39, 0xb7, 0xea, 0xd, 0x50, 0xc, 0x51, 0xb6, 0xeb, 0x65, 0x38, 0xdf, 0x82, 0x67, 0x3a, 0xdd, 0x80, 0xe, 0x53, 0xb4, 0xe9, 0xb5, 0xe8, 0xf, 0x52, 0xdc, 0x81, 0x66, 0x3b, 0xb1, 0xec, 0xb, 0x56, 0xd8, 0x85, 0x62, 0x3f, 0x63, 0x3e, 0xd9, 0x84, 0xa, 0x57, 0xb0, 0xed, 0x8, 0x55, 0xb2, 0xef, 0x61, 0x3c, 0xdb, 0x86, 0xda, 0x87, 0x60, 0x3d, 0xb3, 0xee, 0x9, 0x54, 0xa1, 0xfc, 0x1b, 0x46, 0xc8, 0x95, 0x72, 0x2f, 0x73, 0x2e, 0xc9, 0x94, 0x1a, 0x47, 0xa0, 0xfd, 0x18, 0x45, 0xa2, 0xff, 0x71, 0x2c, 0xcb, 0x96, 0xca, 0x97, 0x70, 0x2d, 0xa3, 0xfe, 0x19, 0x44, 0xce, 0x93, 0x74, 0x29, 0xa7, 0xfa, 0x1d, 0x40, 0x1c, 0x41, 0xa6, 0xfb, 0x75, 0x28, 0xcf, 0x92, 0x77, 0x2a, 0xcd, 0x90, 0x1e, 0x43, 0xa4, 0xf9, 0xa5, 0xf8, 0x1f, 0x42, 0xcc, 0x91, 0x76, 0x2b, 0x7f, 0x22, 0xc5, 0x98, 0x16, 0x4b, 0xac, 0xf1, 0xad, 0xf0, 0x17, 0x4a, 0xc4, 0x99, 0x7e, 0x23, 0xc6, 0x9b, 0x7c, 0x21, 0xaf, 0xf2, 0x15, 0x48, 0x14, 0x49, 0xae, 0xf3, 0x7d, 0x20, 0xc7, 0x9a, 0x10, 0x4d, 0xaa, 0xf7, 0x79, 0x24, 0xc3, 0x9e, 0xc2, 0x9f, 0x78, 0x25, 0xab, 0xf6, 0x11, 0x4c, 0xa9, 0xf4, 0x13, 0x4e, 0xc0, 0x9d, 0x7a, 0x27, 0x7b, 0x26, 0xc1, 0x9c, 0x12, 0x4f, 0xa8, 0xf5},
+ {0x0, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d, 0x89, 0xd7, 0x35, 0x6b, 0xec, 0xb2, 0x50, 0xe, 0x43, 0x1d, 0xff, 0xa1, 0x26, 0x78, 0x9a, 0xc4, 0xf, 0x51, 0xb3, 0xed, 0x6a, 0x34, 0xd6, 0x88, 0xc5, 0x9b, 0x79, 0x27, 0xa0, 0xfe, 0x1c, 0x42, 0x86, 0xd8, 0x3a, 0x64, 0xe3, 0xbd, 0x5f, 0x1, 0x4c, 0x12, 0xf0, 0xae, 0x29, 0x77, 0x95, 0xcb, 0x1e, 0x40, 0xa2, 0xfc, 0x7b, 0x25, 0xc7, 0x99, 0xd4, 0x8a, 0x68, 0x36, 0xb1, 0xef, 0xd, 0x53, 0x97, 0xc9, 0x2b, 0x75, 0xf2, 0xac, 0x4e, 0x10, 0x5d, 0x3, 0xe1, 0xbf, 0x38, 0x66, 0x84, 0xda, 0x11, 0x4f, 0xad, 0xf3, 0x74, 0x2a, 0xc8, 0x96, 0xdb, 0x85, 0x67, 0x39, 0xbe, 0xe0, 0x2, 0x5c, 0x98, 0xc6, 0x24, 0x7a, 0xfd, 0xa3, 0x41, 0x1f, 0x52, 0xc, 0xee, 0xb0, 0x37, 0x69, 0x8b, 0xd5, 0x3c, 0x62, 0x80, 0xde, 0x59, 0x7, 0xe5, 0xbb, 0xf6, 0xa8, 0x4a, 0x14, 0x93, 0xcd, 0x2f, 0x71, 0xb5, 0xeb, 0x9, 0x57, 0xd0, 0x8e, 0x6c, 0x32, 0x7f, 0x21, 0xc3, 0x9d, 0x1a, 0x44, 0xa6, 0xf8, 0x33, 0x6d, 0x8f, 0xd1, 0x56, 0x8, 0xea, 0xb4, 0xf9, 0xa7, 0x45, 0x1b, 0x9c, 0xc2, 0x20, 0x7e, 0xba, 0xe4, 0x6, 0x58, 0xdf, 0x81, 0x63, 0x3d, 0x70, 0x2e, 0xcc, 0x92, 0x15, 0x4b, 0xa9, 0xf7, 0x22, 0x7c, 0x9e, 0xc0, 0x47, 0x19, 0xfb, 0xa5, 0xe8, 0xb6, 0x54, 0xa, 0x8d, 0xd3, 0x31, 0x6f, 0xab, 0xf5, 0x17, 0x49, 0xce, 0x90, 0x72, 0x2c, 0x61, 0x3f, 0xdd, 0x83, 0x4, 0x5a, 0xb8, 0xe6, 0x2d, 0x73, 0x91, 0xcf, 0x48, 0x16, 0xf4, 0xaa, 0xe7, 0xb9, 0x5b, 0x5, 0x82, 0xdc, 0x3e, 0x60, 0xa4, 0xfa, 0x18, 0x46, 0xc1, 0x9f, 0x7d, 0x23, 0x6e, 0x30, 0xd2, 0x8c, 0xb, 0x55, 0xb7, 0xe9},
+ {0x0, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42, 0x99, 0xc6, 0x27, 0x78, 0xf8, 0xa7, 0x46, 0x19, 0x5b, 0x4, 0xe5, 0xba, 0x3a, 0x65, 0x84, 0xdb, 0x2f, 0x70, 0x91, 0xce, 0x4e, 0x11, 0xf0, 0xaf, 0xed, 0xb2, 0x53, 0xc, 0x8c, 0xd3, 0x32, 0x6d, 0xb6, 0xe9, 0x8, 0x57, 0xd7, 0x88, 0x69, 0x36, 0x74, 0x2b, 0xca, 0x95, 0x15, 0x4a, 0xab, 0xf4, 0x5e, 0x1, 0xe0, 0xbf, 0x3f, 0x60, 0x81, 0xde, 0x9c, 0xc3, 0x22, 0x7d, 0xfd, 0xa2, 0x43, 0x1c, 0xc7, 0x98, 0x79, 0x26, 0xa6, 0xf9, 0x18, 0x47, 0x5, 0x5a, 0xbb, 0xe4, 0x64, 0x3b, 0xda, 0x85, 0x71, 0x2e, 0xcf, 0x90, 0x10, 0x4f, 0xae, 0xf1, 0xb3, 0xec, 0xd, 0x52, 0xd2, 0x8d, 0x6c, 0x33, 0xe8, 0xb7, 0x56, 0x9, 0x89, 0xd6, 0x37, 0x68, 0x2a, 0x75, 0x94, 0xcb, 0x4b, 0x14, 0xf5, 0xaa, 0xbc, 0xe3, 0x2, 0x5d, 0xdd, 0x82, 0x63, 0x3c, 0x7e, 0x21, 0xc0, 0x9f, 0x1f, 0x40, 0xa1, 0xfe, 0x25, 0x7a, 0x9b, 0xc4, 0x44, 0x1b, 0xfa, 0xa5, 0xe7, 0xb8, 0x59, 0x6, 0x86, 0xd9, 0x38, 0x67, 0x93, 0xcc, 0x2d, 0x72, 0xf2, 0xad, 0x4c, 0x13, 0x51, 0xe, 0xef, 0xb0, 0x30, 0x6f, 0x8e, 0xd1, 0xa, 0x55, 0xb4, 0xeb, 0x6b, 0x34, 0xd5, 0x8a, 0xc8, 0x97, 0x76, 0x29, 0xa9, 0xf6, 0x17, 0x48, 0xe2, 0xbd, 0x5c, 0x3, 0x83, 0xdc, 0x3d, 0x62, 0x20, 0x7f, 0x9e, 0xc1, 0x41, 0x1e, 0xff, 0xa0, 0x7b, 0x24, 0xc5, 0x9a, 0x1a, 0x45, 0xa4, 0xfb, 0xb9, 0xe6, 0x7, 0x58, 0xd8, 0x87, 0x66, 0x39, 0xcd, 0x92, 0x73, 0x2c, 0xac, 0xf3, 0x12, 0x4d, 0xf, 0x50, 0xb1, 0xee, 0x6e, 0x31, 0xd0, 0x8f, 0x54, 0xb, 0xea, 0xb5, 0x35, 0x6a, 0x8b, 0xd4, 0x96, 0xc9, 0x28, 0x77, 0xf7, 0xa8, 0x49, 0x16},
+ {0x0, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a, 0x4e, 0x2e, 0x8e, 0xee, 0xd3, 0xb3, 0x13, 0x73, 0x69, 0x9, 0xa9, 0xc9, 0xf4, 0x94, 0x34, 0x54, 0x9c, 0xfc, 0x5c, 0x3c, 0x1, 0x61, 0xc1, 0xa1, 0xbb, 0xdb, 0x7b, 0x1b, 0x26, 0x46, 0xe6, 0x86, 0xd2, 0xb2, 0x12, 0x72, 0x4f, 0x2f, 0x8f, 0xef, 0xf5, 0x95, 0x35, 0x55, 0x68, 0x8, 0xa8, 0xc8, 0x25, 0x45, 0xe5, 0x85, 0xb8, 0xd8, 0x78, 0x18, 0x2, 0x62, 0xc2, 0xa2, 0x9f, 0xff, 0x5f, 0x3f, 0x6b, 0xb, 0xab, 0xcb, 0xf6, 0x96, 0x36, 0x56, 0x4c, 0x2c, 0x8c, 0xec, 0xd1, 0xb1, 0x11, 0x71, 0xb9, 0xd9, 0x79, 0x19, 0x24, 0x44, 0xe4, 0x84, 0x9e, 0xfe, 0x5e, 0x3e, 0x3, 0x63, 0xc3, 0xa3, 0xf7, 0x97, 0x37, 0x57, 0x6a, 0xa, 0xaa, 0xca, 0xd0, 0xb0, 0x10, 0x70, 0x4d, 0x2d, 0x8d, 0xed, 0x4a, 0x2a, 0x8a, 0xea, 0xd7, 0xb7, 0x17, 0x77, 0x6d, 0xd, 0xad, 0xcd, 0xf0, 0x90, 0x30, 0x50, 0x4, 0x64, 0xc4, 0xa4, 0x99, 0xf9, 0x59, 0x39, 0x23, 0x43, 0xe3, 0x83, 0xbe, 0xde, 0x7e, 0x1e, 0xd6, 0xb6, 0x16, 0x76, 0x4b, 0x2b, 0x8b, 0xeb, 0xf1, 0x91, 0x31, 0x51, 0x6c, 0xc, 0xac, 0xcc, 0x98, 0xf8, 0x58, 0x38, 0x5, 0x65, 0xc5, 0xa5, 0xbf, 0xdf, 0x7f, 0x1f, 0x22, 0x42, 0xe2, 0x82, 0x6f, 0xf, 0xaf, 0xcf, 0xf2, 0x92, 0x32, 0x52, 0x48, 0x28, 0x88, 0xe8, 0xd5, 0xb5, 0x15, 0x75, 0x21, 0x41, 0xe1, 0x81, 0xbc, 0xdc, 0x7c, 0x1c, 0x6, 0x66, 0xc6, 0xa6, 0x9b, 0xfb, 0x5b, 0x3b, 0xf3, 0x93, 0x33, 0x53, 0x6e, 0xe, 0xae, 0xce, 0xd4, 0xb4, 0x14, 0x74, 0x49, 0x29, 0x89, 0xe9, 0xbd, 0xdd, 0x7d, 0x1d, 0x20, 0x40, 0xe0, 0x80, 0x9a, 0xfa, 0x5a, 0x3a, 0x7, 0x67, 0xc7, 0xa7},
+ {0x0, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15, 0x5e, 0x3f, 0x9c, 0xfd, 0xc7, 0xa6, 0x5, 0x64, 0x71, 0x10, 0xb3, 0xd2, 0xe8, 0x89, 0x2a, 0x4b, 0xbc, 0xdd, 0x7e, 0x1f, 0x25, 0x44, 0xe7, 0x86, 0x93, 0xf2, 0x51, 0x30, 0xa, 0x6b, 0xc8, 0xa9, 0xe2, 0x83, 0x20, 0x41, 0x7b, 0x1a, 0xb9, 0xd8, 0xcd, 0xac, 0xf, 0x6e, 0x54, 0x35, 0x96, 0xf7, 0x65, 0x4, 0xa7, 0xc6, 0xfc, 0x9d, 0x3e, 0x5f, 0x4a, 0x2b, 0x88, 0xe9, 0xd3, 0xb2, 0x11, 0x70, 0x3b, 0x5a, 0xf9, 0x98, 0xa2, 0xc3, 0x60, 0x1, 0x14, 0x75, 0xd6, 0xb7, 0x8d, 0xec, 0x4f, 0x2e, 0xd9, 0xb8, 0x1b, 0x7a, 0x40, 0x21, 0x82, 0xe3, 0xf6, 0x97, 0x34, 0x55, 0x6f, 0xe, 0xad, 0xcc, 0x87, 0xe6, 0x45, 0x24, 0x1e, 0x7f, 0xdc, 0xbd, 0xa8, 0xc9, 0x6a, 0xb, 0x31, 0x50, 0xf3, 0x92, 0xca, 0xab, 0x8, 0x69, 0x53, 0x32, 0x91, 0xf0, 0xe5, 0x84, 0x27, 0x46, 0x7c, 0x1d, 0xbe, 0xdf, 0x94, 0xf5, 0x56, 0x37, 0xd, 0x6c, 0xcf, 0xae, 0xbb, 0xda, 0x79, 0x18, 0x22, 0x43, 0xe0, 0x81, 0x76, 0x17, 0xb4, 0xd5, 0xef, 0x8e, 0x2d, 0x4c, 0x59, 0x38, 0x9b, 0xfa, 0xc0, 0xa1, 0x2, 0x63, 0x28, 0x49, 0xea, 0x8b, 0xb1, 0xd0, 0x73, 0x12, 0x7, 0x66, 0xc5, 0xa4, 0x9e, 0xff, 0x5c, 0x3d, 0xaf, 0xce, 0x6d, 0xc, 0x36, 0x57, 0xf4, 0x95, 0x80, 0xe1, 0x42, 0x23, 0x19, 0x78, 0xdb, 0xba, 0xf1, 0x90, 0x33, 0x52, 0x68, 0x9, 0xaa, 0xcb, 0xde, 0xbf, 0x1c, 0x7d, 0x47, 0x26, 0x85, 0xe4, 0x13, 0x72, 0xd1, 0xb0, 0x8a, 0xeb, 0x48, 0x29, 0x3c, 0x5d, 0xfe, 0x9f, 0xa5, 0xc4, 0x67, 0x6, 0x4d, 0x2c, 0x8f, 0xee, 0xd4, 0xb5, 0x16, 0x77, 0x62, 0x3, 0xa0, 0xc1, 0xfb, 0x9a, 0x39, 0x58},
+ {0x0, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x4, 0x6e, 0xc, 0xaa, 0xc8, 0xfb, 0x99, 0x3f, 0x5d, 0x59, 0x3b, 0x9d, 0xff, 0xcc, 0xae, 0x8, 0x6a, 0xdc, 0xbe, 0x18, 0x7a, 0x49, 0x2b, 0x8d, 0xef, 0xeb, 0x89, 0x2f, 0x4d, 0x7e, 0x1c, 0xba, 0xd8, 0xb2, 0xd0, 0x76, 0x14, 0x27, 0x45, 0xe3, 0x81, 0x85, 0xe7, 0x41, 0x23, 0x10, 0x72, 0xd4, 0xb6, 0xa5, 0xc7, 0x61, 0x3, 0x30, 0x52, 0xf4, 0x96, 0x92, 0xf0, 0x56, 0x34, 0x7, 0x65, 0xc3, 0xa1, 0xcb, 0xa9, 0xf, 0x6d, 0x5e, 0x3c, 0x9a, 0xf8, 0xfc, 0x9e, 0x38, 0x5a, 0x69, 0xb, 0xad, 0xcf, 0x79, 0x1b, 0xbd, 0xdf, 0xec, 0x8e, 0x28, 0x4a, 0x4e, 0x2c, 0x8a, 0xe8, 0xdb, 0xb9, 0x1f, 0x7d, 0x17, 0x75, 0xd3, 0xb1, 0x82, 0xe0, 0x46, 0x24, 0x20, 0x42, 0xe4, 0x86, 0xb5, 0xd7, 0x71, 0x13, 0x57, 0x35, 0x93, 0xf1, 0xc2, 0xa0, 0x6, 0x64, 0x60, 0x2, 0xa4, 0xc6, 0xf5, 0x97, 0x31, 0x53, 0x39, 0x5b, 0xfd, 0x9f, 0xac, 0xce, 0x68, 0xa, 0xe, 0x6c, 0xca, 0xa8, 0x9b, 0xf9, 0x5f, 0x3d, 0x8b, 0xe9, 0x4f, 0x2d, 0x1e, 0x7c, 0xda, 0xb8, 0xbc, 0xde, 0x78, 0x1a, 0x29, 0x4b, 0xed, 0x8f, 0xe5, 0x87, 0x21, 0x43, 0x70, 0x12, 0xb4, 0xd6, 0xd2, 0xb0, 0x16, 0x74, 0x47, 0x25, 0x83, 0xe1, 0xf2, 0x90, 0x36, 0x54, 0x67, 0x5, 0xa3, 0xc1, 0xc5, 0xa7, 0x1, 0x63, 0x50, 0x32, 0x94, 0xf6, 0x9c, 0xfe, 0x58, 0x3a, 0x9, 0x6b, 0xcd, 0xaf, 0xab, 0xc9, 0x6f, 0xd, 0x3e, 0x5c, 0xfa, 0x98, 0x2e, 0x4c, 0xea, 0x88, 0xbb, 0xd9, 0x7f, 0x1d, 0x19, 0x7b, 0xdd, 0xbf, 0x8c, 0xee, 0x48, 0x2a, 0x40, 0x22, 0x84, 0xe6, 0xd5, 0xb7, 0x11, 0x73, 0x77, 0x15, 0xb3, 0xd1, 0xe2, 0x80, 0x26, 0x44},
+ {0x0, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0xb, 0x7e, 0x1d, 0xb8, 0xdb, 0xef, 0x8c, 0x29, 0x4a, 0x41, 0x22, 0x87, 0xe4, 0xd0, 0xb3, 0x16, 0x75, 0xfc, 0x9f, 0x3a, 0x59, 0x6d, 0xe, 0xab, 0xc8, 0xc3, 0xa0, 0x5, 0x66, 0x52, 0x31, 0x94, 0xf7, 0x82, 0xe1, 0x44, 0x27, 0x13, 0x70, 0xd5, 0xb6, 0xbd, 0xde, 0x7b, 0x18, 0x2c, 0x4f, 0xea, 0x89, 0xe5, 0x86, 0x23, 0x40, 0x74, 0x17, 0xb2, 0xd1, 0xda, 0xb9, 0x1c, 0x7f, 0x4b, 0x28, 0x8d, 0xee, 0x9b, 0xf8, 0x5d, 0x3e, 0xa, 0x69, 0xcc, 0xaf, 0xa4, 0xc7, 0x62, 0x1, 0x35, 0x56, 0xf3, 0x90, 0x19, 0x7a, 0xdf, 0xbc, 0x88, 0xeb, 0x4e, 0x2d, 0x26, 0x45, 0xe0, 0x83, 0xb7, 0xd4, 0x71, 0x12, 0x67, 0x4, 0xa1, 0xc2, 0xf6, 0x95, 0x30, 0x53, 0x58, 0x3b, 0x9e, 0xfd, 0xc9, 0xaa, 0xf, 0x6c, 0xd7, 0xb4, 0x11, 0x72, 0x46, 0x25, 0x80, 0xe3, 0xe8, 0x8b, 0x2e, 0x4d, 0x79, 0x1a, 0xbf, 0xdc, 0xa9, 0xca, 0x6f, 0xc, 0x38, 0x5b, 0xfe, 0x9d, 0x96, 0xf5, 0x50, 0x33, 0x7, 0x64, 0xc1, 0xa2, 0x2b, 0x48, 0xed, 0x8e, 0xba, 0xd9, 0x7c, 0x1f, 0x14, 0x77, 0xd2, 0xb1, 0x85, 0xe6, 0x43, 0x20, 0x55, 0x36, 0x93, 0xf0, 0xc4, 0xa7, 0x2, 0x61, 0x6a, 0x9, 0xac, 0xcf, 0xfb, 0x98, 0x3d, 0x5e, 0x32, 0x51, 0xf4, 0x97, 0xa3, 0xc0, 0x65, 0x6, 0xd, 0x6e, 0xcb, 0xa8, 0x9c, 0xff, 0x5a, 0x39, 0x4c, 0x2f, 0x8a, 0xe9, 0xdd, 0xbe, 0x1b, 0x78, 0x73, 0x10, 0xb5, 0xd6, 0xe2, 0x81, 0x24, 0x47, 0xce, 0xad, 0x8, 0x6b, 0x5f, 0x3c, 0x99, 0xfa, 0xf1, 0x92, 0x37, 0x54, 0x60, 0x3, 0xa6, 0xc5, 0xb0, 0xd3, 0x76, 0x15, 0x21, 0x42, 0xe7, 0x84, 0x8f, 0xec, 0x49, 0x2a, 0x1e, 0x7d, 0xd8, 0xbb},
+ {0x0, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x7, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26, 0xe, 0x6a, 0xc6, 0xa2, 0x83, 0xe7, 0x4b, 0x2f, 0x9, 0x6d, 0xc1, 0xa5, 0x84, 0xe0, 0x4c, 0x28, 0x1c, 0x78, 0xd4, 0xb0, 0x91, 0xf5, 0x59, 0x3d, 0x1b, 0x7f, 0xd3, 0xb7, 0x96, 0xf2, 0x5e, 0x3a, 0x12, 0x76, 0xda, 0xbe, 0x9f, 0xfb, 0x57, 0x33, 0x15, 0x71, 0xdd, 0xb9, 0x98, 0xfc, 0x50, 0x34, 0x38, 0x5c, 0xf0, 0x94, 0xb5, 0xd1, 0x7d, 0x19, 0x3f, 0x5b, 0xf7, 0x93, 0xb2, 0xd6, 0x7a, 0x1e, 0x36, 0x52, 0xfe, 0x9a, 0xbb, 0xdf, 0x73, 0x17, 0x31, 0x55, 0xf9, 0x9d, 0xbc, 0xd8, 0x74, 0x10, 0x24, 0x40, 0xec, 0x88, 0xa9, 0xcd, 0x61, 0x5, 0x23, 0x47, 0xeb, 0x8f, 0xae, 0xca, 0x66, 0x2, 0x2a, 0x4e, 0xe2, 0x86, 0xa7, 0xc3, 0x6f, 0xb, 0x2d, 0x49, 0xe5, 0x81, 0xa0, 0xc4, 0x68, 0xc, 0x70, 0x14, 0xb8, 0xdc, 0xfd, 0x99, 0x35, 0x51, 0x77, 0x13, 0xbf, 0xdb, 0xfa, 0x9e, 0x32, 0x56, 0x7e, 0x1a, 0xb6, 0xd2, 0xf3, 0x97, 0x3b, 0x5f, 0x79, 0x1d, 0xb1, 0xd5, 0xf4, 0x90, 0x3c, 0x58, 0x6c, 0x8, 0xa4, 0xc0, 0xe1, 0x85, 0x29, 0x4d, 0x6b, 0xf, 0xa3, 0xc7, 0xe6, 0x82, 0x2e, 0x4a, 0x62, 0x6, 0xaa, 0xce, 0xef, 0x8b, 0x27, 0x43, 0x65, 0x1, 0xad, 0xc9, 0xe8, 0x8c, 0x20, 0x44, 0x48, 0x2c, 0x80, 0xe4, 0xc5, 0xa1, 0xd, 0x69, 0x4f, 0x2b, 0x87, 0xe3, 0xc2, 0xa6, 0xa, 0x6e, 0x46, 0x22, 0x8e, 0xea, 0xcb, 0xaf, 0x3, 0x67, 0x41, 0x25, 0x89, 0xed, 0xcc, 0xa8, 0x4, 0x60, 0x54, 0x30, 0x9c, 0xf8, 0xd9, 0xbd, 0x11, 0x75, 0x53, 0x37, 0x9b, 0xff, 0xde, 0xba, 0x16, 0x72, 0x5a, 0x3e, 0x92, 0xf6, 0xd7, 0xb3, 0x1f, 0x7b, 0x5d, 0x39, 0x95, 0xf1, 0xd0, 0xb4, 0x18, 0x7c},
+ {0x0, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0xf, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29, 0x1e, 0x7b, 0xd4, 0xb1, 0x97, 0xf2, 0x5d, 0x38, 0x11, 0x74, 0xdb, 0xbe, 0x98, 0xfd, 0x52, 0x37, 0x3c, 0x59, 0xf6, 0x93, 0xb5, 0xd0, 0x7f, 0x1a, 0x33, 0x56, 0xf9, 0x9c, 0xba, 0xdf, 0x70, 0x15, 0x22, 0x47, 0xe8, 0x8d, 0xab, 0xce, 0x61, 0x4, 0x2d, 0x48, 0xe7, 0x82, 0xa4, 0xc1, 0x6e, 0xb, 0x78, 0x1d, 0xb2, 0xd7, 0xf1, 0x94, 0x3b, 0x5e, 0x77, 0x12, 0xbd, 0xd8, 0xfe, 0x9b, 0x34, 0x51, 0x66, 0x3, 0xac, 0xc9, 0xef, 0x8a, 0x25, 0x40, 0x69, 0xc, 0xa3, 0xc6, 0xe0, 0x85, 0x2a, 0x4f, 0x44, 0x21, 0x8e, 0xeb, 0xcd, 0xa8, 0x7, 0x62, 0x4b, 0x2e, 0x81, 0xe4, 0xc2, 0xa7, 0x8, 0x6d, 0x5a, 0x3f, 0x90, 0xf5, 0xd3, 0xb6, 0x19, 0x7c, 0x55, 0x30, 0x9f, 0xfa, 0xdc, 0xb9, 0x16, 0x73, 0xf0, 0x95, 0x3a, 0x5f, 0x79, 0x1c, 0xb3, 0xd6, 0xff, 0x9a, 0x35, 0x50, 0x76, 0x13, 0xbc, 0xd9, 0xee, 0x8b, 0x24, 0x41, 0x67, 0x2, 0xad, 0xc8, 0xe1, 0x84, 0x2b, 0x4e, 0x68, 0xd, 0xa2, 0xc7, 0xcc, 0xa9, 0x6, 0x63, 0x45, 0x20, 0x8f, 0xea, 0xc3, 0xa6, 0x9, 0x6c, 0x4a, 0x2f, 0x80, 0xe5, 0xd2, 0xb7, 0x18, 0x7d, 0x5b, 0x3e, 0x91, 0xf4, 0xdd, 0xb8, 0x17, 0x72, 0x54, 0x31, 0x9e, 0xfb, 0x88, 0xed, 0x42, 0x27, 0x1, 0x64, 0xcb, 0xae, 0x87, 0xe2, 0x4d, 0x28, 0xe, 0x6b, 0xc4, 0xa1, 0x96, 0xf3, 0x5c, 0x39, 0x1f, 0x7a, 0xd5, 0xb0, 0x99, 0xfc, 0x53, 0x36, 0x10, 0x75, 0xda, 0xbf, 0xb4, 0xd1, 0x7e, 0x1b, 0x3d, 0x58, 0xf7, 0x92, 0xbb, 0xde, 0x71, 0x14, 0x32, 0x57, 0xf8, 0x9d, 0xaa, 0xcf, 0x60, 0x5, 0x23, 0x46, 0xe9, 0x8c, 0xa5, 0xc0, 0x6f, 0xa, 0x2c, 0x49, 0xe6, 0x83},
+ {0x0, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38, 0x2e, 0x48, 0xe2, 0x84, 0xab, 0xcd, 0x67, 0x1, 0x39, 0x5f, 0xf5, 0x93, 0xbc, 0xda, 0x70, 0x16, 0x5c, 0x3a, 0x90, 0xf6, 0xd9, 0xbf, 0x15, 0x73, 0x4b, 0x2d, 0x87, 0xe1, 0xce, 0xa8, 0x2, 0x64, 0x72, 0x14, 0xbe, 0xd8, 0xf7, 0x91, 0x3b, 0x5d, 0x65, 0x3, 0xa9, 0xcf, 0xe0, 0x86, 0x2c, 0x4a, 0xb8, 0xde, 0x74, 0x12, 0x3d, 0x5b, 0xf1, 0x97, 0xaf, 0xc9, 0x63, 0x5, 0x2a, 0x4c, 0xe6, 0x80, 0x96, 0xf0, 0x5a, 0x3c, 0x13, 0x75, 0xdf, 0xb9, 0x81, 0xe7, 0x4d, 0x2b, 0x4, 0x62, 0xc8, 0xae, 0xe4, 0x82, 0x28, 0x4e, 0x61, 0x7, 0xad, 0xcb, 0xf3, 0x95, 0x3f, 0x59, 0x76, 0x10, 0xba, 0xdc, 0xca, 0xac, 0x6, 0x60, 0x4f, 0x29, 0x83, 0xe5, 0xdd, 0xbb, 0x11, 0x77, 0x58, 0x3e, 0x94, 0xf2, 0x6d, 0xb, 0xa1, 0xc7, 0xe8, 0x8e, 0x24, 0x42, 0x7a, 0x1c, 0xb6, 0xd0, 0xff, 0x99, 0x33, 0x55, 0x43, 0x25, 0x8f, 0xe9, 0xc6, 0xa0, 0xa, 0x6c, 0x54, 0x32, 0x98, 0xfe, 0xd1, 0xb7, 0x1d, 0x7b, 0x31, 0x57, 0xfd, 0x9b, 0xb4, 0xd2, 0x78, 0x1e, 0x26, 0x40, 0xea, 0x8c, 0xa3, 0xc5, 0x6f, 0x9, 0x1f, 0x79, 0xd3, 0xb5, 0x9a, 0xfc, 0x56, 0x30, 0x8, 0x6e, 0xc4, 0xa2, 0x8d, 0xeb, 0x41, 0x27, 0xd5, 0xb3, 0x19, 0x7f, 0x50, 0x36, 0x9c, 0xfa, 0xc2, 0xa4, 0xe, 0x68, 0x47, 0x21, 0x8b, 0xed, 0xfb, 0x9d, 0x37, 0x51, 0x7e, 0x18, 0xb2, 0xd4, 0xec, 0x8a, 0x20, 0x46, 0x69, 0xf, 0xa5, 0xc3, 0x89, 0xef, 0x45, 0x23, 0xc, 0x6a, 0xc0, 0xa6, 0x9e, 0xf8, 0x52, 0x34, 0x1b, 0x7d, 0xd7, 0xb1, 0xa7, 0xc1, 0x6b, 0xd, 0x22, 0x44, 0xee, 0x88, 0xb0, 0xd6, 0x7c, 0x1a, 0x35, 0x53, 0xf9, 0x9f},
+ {0x0, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37, 0x3e, 0x59, 0xf0, 0x97, 0xbf, 0xd8, 0x71, 0x16, 0x21, 0x46, 0xef, 0x88, 0xa0, 0xc7, 0x6e, 0x9, 0x7c, 0x1b, 0xb2, 0xd5, 0xfd, 0x9a, 0x33, 0x54, 0x63, 0x4, 0xad, 0xca, 0xe2, 0x85, 0x2c, 0x4b, 0x42, 0x25, 0x8c, 0xeb, 0xc3, 0xa4, 0xd, 0x6a, 0x5d, 0x3a, 0x93, 0xf4, 0xdc, 0xbb, 0x12, 0x75, 0xf8, 0x9f, 0x36, 0x51, 0x79, 0x1e, 0xb7, 0xd0, 0xe7, 0x80, 0x29, 0x4e, 0x66, 0x1, 0xa8, 0xcf, 0xc6, 0xa1, 0x8, 0x6f, 0x47, 0x20, 0x89, 0xee, 0xd9, 0xbe, 0x17, 0x70, 0x58, 0x3f, 0x96, 0xf1, 0x84, 0xe3, 0x4a, 0x2d, 0x5, 0x62, 0xcb, 0xac, 0x9b, 0xfc, 0x55, 0x32, 0x1a, 0x7d, 0xd4, 0xb3, 0xba, 0xdd, 0x74, 0x13, 0x3b, 0x5c, 0xf5, 0x92, 0xa5, 0xc2, 0x6b, 0xc, 0x24, 0x43, 0xea, 0x8d, 0xed, 0x8a, 0x23, 0x44, 0x6c, 0xb, 0xa2, 0xc5, 0xf2, 0x95, 0x3c, 0x5b, 0x73, 0x14, 0xbd, 0xda, 0xd3, 0xb4, 0x1d, 0x7a, 0x52, 0x35, 0x9c, 0xfb, 0xcc, 0xab, 0x2, 0x65, 0x4d, 0x2a, 0x83, 0xe4, 0x91, 0xf6, 0x5f, 0x38, 0x10, 0x77, 0xde, 0xb9, 0x8e, 0xe9, 0x40, 0x27, 0xf, 0x68, 0xc1, 0xa6, 0xaf, 0xc8, 0x61, 0x6, 0x2e, 0x49, 0xe0, 0x87, 0xb0, 0xd7, 0x7e, 0x19, 0x31, 0x56, 0xff, 0x98, 0x15, 0x72, 0xdb, 0xbc, 0x94, 0xf3, 0x5a, 0x3d, 0xa, 0x6d, 0xc4, 0xa3, 0x8b, 0xec, 0x45, 0x22, 0x2b, 0x4c, 0xe5, 0x82, 0xaa, 0xcd, 0x64, 0x3, 0x34, 0x53, 0xfa, 0x9d, 0xb5, 0xd2, 0x7b, 0x1c, 0x69, 0xe, 0xa7, 0xc0, 0xe8, 0x8f, 0x26, 0x41, 0x76, 0x11, 0xb8, 0xdf, 0xf7, 0x90, 0x39, 0x5e, 0x57, 0x30, 0x99, 0xfe, 0xd6, 0xb1, 0x18, 0x7f, 0x48, 0x2f, 0x86, 0xe1, 0xc9, 0xae, 0x7, 0x60},
+ {0x0, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x5, 0x67, 0xf, 0xb7, 0xdf, 0xda, 0xb2, 0xa, 0x62, 0xce, 0xa6, 0x1e, 0x76, 0x73, 0x1b, 0xa3, 0xcb, 0xa9, 0xc1, 0x79, 0x11, 0x14, 0x7c, 0xc4, 0xac, 0x81, 0xe9, 0x51, 0x39, 0x3c, 0x54, 0xec, 0x84, 0xe6, 0x8e, 0x36, 0x5e, 0x5b, 0x33, 0x8b, 0xe3, 0x4f, 0x27, 0x9f, 0xf7, 0xf2, 0x9a, 0x22, 0x4a, 0x28, 0x40, 0xf8, 0x90, 0x95, 0xfd, 0x45, 0x2d, 0x1f, 0x77, 0xcf, 0xa7, 0xa2, 0xca, 0x72, 0x1a, 0x78, 0x10, 0xa8, 0xc0, 0xc5, 0xad, 0x15, 0x7d, 0xd1, 0xb9, 0x1, 0x69, 0x6c, 0x4, 0xbc, 0xd4, 0xb6, 0xde, 0x66, 0xe, 0xb, 0x63, 0xdb, 0xb3, 0x9e, 0xf6, 0x4e, 0x26, 0x23, 0x4b, 0xf3, 0x9b, 0xf9, 0x91, 0x29, 0x41, 0x44, 0x2c, 0x94, 0xfc, 0x50, 0x38, 0x80, 0xe8, 0xed, 0x85, 0x3d, 0x55, 0x37, 0x5f, 0xe7, 0x8f, 0x8a, 0xe2, 0x5a, 0x32, 0x3e, 0x56, 0xee, 0x86, 0x83, 0xeb, 0x53, 0x3b, 0x59, 0x31, 0x89, 0xe1, 0xe4, 0x8c, 0x34, 0x5c, 0xf0, 0x98, 0x20, 0x48, 0x4d, 0x25, 0x9d, 0xf5, 0x97, 0xff, 0x47, 0x2f, 0x2a, 0x42, 0xfa, 0x92, 0xbf, 0xd7, 0x6f, 0x7, 0x2, 0x6a, 0xd2, 0xba, 0xd8, 0xb0, 0x8, 0x60, 0x65, 0xd, 0xb5, 0xdd, 0x71, 0x19, 0xa1, 0xc9, 0xcc, 0xa4, 0x1c, 0x74, 0x16, 0x7e, 0xc6, 0xae, 0xab, 0xc3, 0x7b, 0x13, 0x21, 0x49, 0xf1, 0x99, 0x9c, 0xf4, 0x4c, 0x24, 0x46, 0x2e, 0x96, 0xfe, 0xfb, 0x93, 0x2b, 0x43, 0xef, 0x87, 0x3f, 0x57, 0x52, 0x3a, 0x82, 0xea, 0x88, 0xe0, 0x58, 0x30, 0x35, 0x5d, 0xe5, 0x8d, 0xa0, 0xc8, 0x70, 0x18, 0x1d, 0x75, 0xcd, 0xa5, 0xc7, 0xaf, 0x17, 0x7f, 0x7a, 0x12, 0xaa, 0xc2, 0x6e, 0x6, 0xbe, 0xd6, 0xd3, 0xbb, 0x3, 0x6b, 0x9, 0x61, 0xd9, 0xb1, 0xb4, 0xdc, 0x64, 0xc},
+ {0x0, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x2, 0x6f, 0x6, 0xbd, 0xd4, 0xd6, 0xbf, 0x4, 0x6d, 0xde, 0xb7, 0xc, 0x65, 0x67, 0xe, 0xb5, 0xdc, 0xb1, 0xd8, 0x63, 0xa, 0x8, 0x61, 0xda, 0xb3, 0xa1, 0xc8, 0x73, 0x1a, 0x18, 0x71, 0xca, 0xa3, 0xce, 0xa7, 0x1c, 0x75, 0x77, 0x1e, 0xa5, 0xcc, 0x7f, 0x16, 0xad, 0xc4, 0xc6, 0xaf, 0x14, 0x7d, 0x10, 0x79, 0xc2, 0xab, 0xa9, 0xc0, 0x7b, 0x12, 0x5f, 0x36, 0x8d, 0xe4, 0xe6, 0x8f, 0x34, 0x5d, 0x30, 0x59, 0xe2, 0x8b, 0x89, 0xe0, 0x5b, 0x32, 0x81, 0xe8, 0x53, 0x3a, 0x38, 0x51, 0xea, 0x83, 0xee, 0x87, 0x3c, 0x55, 0x57, 0x3e, 0x85, 0xec, 0xfe, 0x97, 0x2c, 0x45, 0x47, 0x2e, 0x95, 0xfc, 0x91, 0xf8, 0x43, 0x2a, 0x28, 0x41, 0xfa, 0x93, 0x20, 0x49, 0xf2, 0x9b, 0x99, 0xf0, 0x4b, 0x22, 0x4f, 0x26, 0x9d, 0xf4, 0xf6, 0x9f, 0x24, 0x4d, 0xbe, 0xd7, 0x6c, 0x5, 0x7, 0x6e, 0xd5, 0xbc, 0xd1, 0xb8, 0x3, 0x6a, 0x68, 0x1, 0xba, 0xd3, 0x60, 0x9, 0xb2, 0xdb, 0xd9, 0xb0, 0xb, 0x62, 0xf, 0x66, 0xdd, 0xb4, 0xb6, 0xdf, 0x64, 0xd, 0x1f, 0x76, 0xcd, 0xa4, 0xa6, 0xcf, 0x74, 0x1d, 0x70, 0x19, 0xa2, 0xcb, 0xc9, 0xa0, 0x1b, 0x72, 0xc1, 0xa8, 0x13, 0x7a, 0x78, 0x11, 0xaa, 0xc3, 0xae, 0xc7, 0x7c, 0x15, 0x17, 0x7e, 0xc5, 0xac, 0xe1, 0x88, 0x33, 0x5a, 0x58, 0x31, 0x8a, 0xe3, 0x8e, 0xe7, 0x5c, 0x35, 0x37, 0x5e, 0xe5, 0x8c, 0x3f, 0x56, 0xed, 0x84, 0x86, 0xef, 0x54, 0x3d, 0x50, 0x39, 0x82, 0xeb, 0xe9, 0x80, 0x3b, 0x52, 0x40, 0x29, 0x92, 0xfb, 0xf9, 0x90, 0x2b, 0x42, 0x2f, 0x46, 0xfd, 0x94, 0x96, 0xff, 0x44, 0x2d, 0x9e, 0xf7, 0x4c, 0x25, 0x27, 0x4e, 0xf5, 0x9c, 0xf1, 0x98, 0x23, 0x4a, 0x48, 0x21, 0x9a, 0xf3},
+ {0x0, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0xb, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c, 0xee, 0x84, 0x3a, 0x50, 0x5b, 0x31, 0x8f, 0xe5, 0x99, 0xf3, 0x4d, 0x27, 0x2c, 0x46, 0xf8, 0x92, 0xc1, 0xab, 0x15, 0x7f, 0x74, 0x1e, 0xa0, 0xca, 0xb6, 0xdc, 0x62, 0x8, 0x3, 0x69, 0xd7, 0xbd, 0x2f, 0x45, 0xfb, 0x91, 0x9a, 0xf0, 0x4e, 0x24, 0x58, 0x32, 0x8c, 0xe6, 0xed, 0x87, 0x39, 0x53, 0x9f, 0xf5, 0x4b, 0x21, 0x2a, 0x40, 0xfe, 0x94, 0xe8, 0x82, 0x3c, 0x56, 0x5d, 0x37, 0x89, 0xe3, 0x71, 0x1b, 0xa5, 0xcf, 0xc4, 0xae, 0x10, 0x7a, 0x6, 0x6c, 0xd2, 0xb8, 0xb3, 0xd9, 0x67, 0xd, 0x5e, 0x34, 0x8a, 0xe0, 0xeb, 0x81, 0x3f, 0x55, 0x29, 0x43, 0xfd, 0x97, 0x9c, 0xf6, 0x48, 0x22, 0xb0, 0xda, 0x64, 0xe, 0x5, 0x6f, 0xd1, 0xbb, 0xc7, 0xad, 0x13, 0x79, 0x72, 0x18, 0xa6, 0xcc, 0x23, 0x49, 0xf7, 0x9d, 0x96, 0xfc, 0x42, 0x28, 0x54, 0x3e, 0x80, 0xea, 0xe1, 0x8b, 0x35, 0x5f, 0xcd, 0xa7, 0x19, 0x73, 0x78, 0x12, 0xac, 0xc6, 0xba, 0xd0, 0x6e, 0x4, 0xf, 0x65, 0xdb, 0xb1, 0xe2, 0x88, 0x36, 0x5c, 0x57, 0x3d, 0x83, 0xe9, 0x95, 0xff, 0x41, 0x2b, 0x20, 0x4a, 0xf4, 0x9e, 0xc, 0x66, 0xd8, 0xb2, 0xb9, 0xd3, 0x6d, 0x7, 0x7b, 0x11, 0xaf, 0xc5, 0xce, 0xa4, 0x1a, 0x70, 0xbc, 0xd6, 0x68, 0x2, 0x9, 0x63, 0xdd, 0xb7, 0xcb, 0xa1, 0x1f, 0x75, 0x7e, 0x14, 0xaa, 0xc0, 0x52, 0x38, 0x86, 0xec, 0xe7, 0x8d, 0x33, 0x59, 0x25, 0x4f, 0xf1, 0x9b, 0x90, 0xfa, 0x44, 0x2e, 0x7d, 0x17, 0xa9, 0xc3, 0xc8, 0xa2, 0x1c, 0x76, 0xa, 0x60, 0xde, 0xb4, 0xbf, 0xd5, 0x6b, 0x1, 0x93, 0xf9, 0x47, 0x2d, 0x26, 0x4c, 0xf2, 0x98, 0xe4, 0x8e, 0x30, 0x5a, 0x51, 0x3b, 0x85, 0xef},
+ {0x0, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0xc, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73, 0xfe, 0x95, 0x28, 0x43, 0x4f, 0x24, 0x99, 0xf2, 0x81, 0xea, 0x57, 0x3c, 0x30, 0x5b, 0xe6, 0x8d, 0xe1, 0x8a, 0x37, 0x5c, 0x50, 0x3b, 0x86, 0xed, 0x9e, 0xf5, 0x48, 0x23, 0x2f, 0x44, 0xf9, 0x92, 0x1f, 0x74, 0xc9, 0xa2, 0xae, 0xc5, 0x78, 0x13, 0x60, 0xb, 0xb6, 0xdd, 0xd1, 0xba, 0x7, 0x6c, 0xdf, 0xb4, 0x9, 0x62, 0x6e, 0x5, 0xb8, 0xd3, 0xa0, 0xcb, 0x76, 0x1d, 0x11, 0x7a, 0xc7, 0xac, 0x21, 0x4a, 0xf7, 0x9c, 0x90, 0xfb, 0x46, 0x2d, 0x5e, 0x35, 0x88, 0xe3, 0xef, 0x84, 0x39, 0x52, 0x3e, 0x55, 0xe8, 0x83, 0x8f, 0xe4, 0x59, 0x32, 0x41, 0x2a, 0x97, 0xfc, 0xf0, 0x9b, 0x26, 0x4d, 0xc0, 0xab, 0x16, 0x7d, 0x71, 0x1a, 0xa7, 0xcc, 0xbf, 0xd4, 0x69, 0x2, 0xe, 0x65, 0xd8, 0xb3, 0xa3, 0xc8, 0x75, 0x1e, 0x12, 0x79, 0xc4, 0xaf, 0xdc, 0xb7, 0xa, 0x61, 0x6d, 0x6, 0xbb, 0xd0, 0x5d, 0x36, 0x8b, 0xe0, 0xec, 0x87, 0x3a, 0x51, 0x22, 0x49, 0xf4, 0x9f, 0x93, 0xf8, 0x45, 0x2e, 0x42, 0x29, 0x94, 0xff, 0xf3, 0x98, 0x25, 0x4e, 0x3d, 0x56, 0xeb, 0x80, 0x8c, 0xe7, 0x5a, 0x31, 0xbc, 0xd7, 0x6a, 0x1, 0xd, 0x66, 0xdb, 0xb0, 0xc3, 0xa8, 0x15, 0x7e, 0x72, 0x19, 0xa4, 0xcf, 0x7c, 0x17, 0xaa, 0xc1, 0xcd, 0xa6, 0x1b, 0x70, 0x3, 0x68, 0xd5, 0xbe, 0xb2, 0xd9, 0x64, 0xf, 0x82, 0xe9, 0x54, 0x3f, 0x33, 0x58, 0xe5, 0x8e, 0xfd, 0x96, 0x2b, 0x40, 0x4c, 0x27, 0x9a, 0xf1, 0x9d, 0xf6, 0x4b, 0x20, 0x2c, 0x47, 0xfa, 0x91, 0xe2, 0x89, 0x34, 0x5f, 0x53, 0x38, 0x85, 0xee, 0x63, 0x8, 0xb5, 0xde, 0xd2, 0xb9, 0x4, 0x6f, 0x1c, 0x77, 0xca, 0xa1, 0xad, 0xc6, 0x7b, 0x10},
+ {0x0, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e, 0x8e, 0xe2, 0x56, 0x3a, 0x23, 0x4f, 0xfb, 0x97, 0xc9, 0xa5, 0x11, 0x7d, 0x64, 0x8, 0xbc, 0xd0, 0x1, 0x6d, 0xd9, 0xb5, 0xac, 0xc0, 0x74, 0x18, 0x46, 0x2a, 0x9e, 0xf2, 0xeb, 0x87, 0x33, 0x5f, 0x8f, 0xe3, 0x57, 0x3b, 0x22, 0x4e, 0xfa, 0x96, 0xc8, 0xa4, 0x10, 0x7c, 0x65, 0x9, 0xbd, 0xd1, 0x2, 0x6e, 0xda, 0xb6, 0xaf, 0xc3, 0x77, 0x1b, 0x45, 0x29, 0x9d, 0xf1, 0xe8, 0x84, 0x30, 0x5c, 0x8c, 0xe0, 0x54, 0x38, 0x21, 0x4d, 0xf9, 0x95, 0xcb, 0xa7, 0x13, 0x7f, 0x66, 0xa, 0xbe, 0xd2, 0x3, 0x6f, 0xdb, 0xb7, 0xae, 0xc2, 0x76, 0x1a, 0x44, 0x28, 0x9c, 0xf0, 0xe9, 0x85, 0x31, 0x5d, 0x8d, 0xe1, 0x55, 0x39, 0x20, 0x4c, 0xf8, 0x94, 0xca, 0xa6, 0x12, 0x7e, 0x67, 0xb, 0xbf, 0xd3, 0x4, 0x68, 0xdc, 0xb0, 0xa9, 0xc5, 0x71, 0x1d, 0x43, 0x2f, 0x9b, 0xf7, 0xee, 0x82, 0x36, 0x5a, 0x8a, 0xe6, 0x52, 0x3e, 0x27, 0x4b, 0xff, 0x93, 0xcd, 0xa1, 0x15, 0x79, 0x60, 0xc, 0xb8, 0xd4, 0x5, 0x69, 0xdd, 0xb1, 0xa8, 0xc4, 0x70, 0x1c, 0x42, 0x2e, 0x9a, 0xf6, 0xef, 0x83, 0x37, 0x5b, 0x8b, 0xe7, 0x53, 0x3f, 0x26, 0x4a, 0xfe, 0x92, 0xcc, 0xa0, 0x14, 0x78, 0x61, 0xd, 0xb9, 0xd5, 0x6, 0x6a, 0xde, 0xb2, 0xab, 0xc7, 0x73, 0x1f, 0x41, 0x2d, 0x99, 0xf5, 0xec, 0x80, 0x34, 0x58, 0x88, 0xe4, 0x50, 0x3c, 0x25, 0x49, 0xfd, 0x91, 0xcf, 0xa3, 0x17, 0x7b, 0x62, 0xe, 0xba, 0xd6, 0x7, 0x6b, 0xdf, 0xb3, 0xaa, 0xc6, 0x72, 0x1e, 0x40, 0x2c, 0x98, 0xf4, 0xed, 0x81, 0x35, 0x59, 0x89, 0xe5, 0x51, 0x3d, 0x24, 0x48, 0xfc, 0x90, 0xce, 0xa2, 0x16, 0x7a, 0x63, 0xf, 0xbb, 0xd7},
+ {0x0, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51, 0x9e, 0xf3, 0x44, 0x29, 0x37, 0x5a, 0xed, 0x80, 0xd1, 0xbc, 0xb, 0x66, 0x78, 0x15, 0xa2, 0xcf, 0x21, 0x4c, 0xfb, 0x96, 0x88, 0xe5, 0x52, 0x3f, 0x6e, 0x3, 0xb4, 0xd9, 0xc7, 0xaa, 0x1d, 0x70, 0xbf, 0xd2, 0x65, 0x8, 0x16, 0x7b, 0xcc, 0xa1, 0xf0, 0x9d, 0x2a, 0x47, 0x59, 0x34, 0x83, 0xee, 0x42, 0x2f, 0x98, 0xf5, 0xeb, 0x86, 0x31, 0x5c, 0xd, 0x60, 0xd7, 0xba, 0xa4, 0xc9, 0x7e, 0x13, 0xdc, 0xb1, 0x6, 0x6b, 0x75, 0x18, 0xaf, 0xc2, 0x93, 0xfe, 0x49, 0x24, 0x3a, 0x57, 0xe0, 0x8d, 0x63, 0xe, 0xb9, 0xd4, 0xca, 0xa7, 0x10, 0x7d, 0x2c, 0x41, 0xf6, 0x9b, 0x85, 0xe8, 0x5f, 0x32, 0xfd, 0x90, 0x27, 0x4a, 0x54, 0x39, 0x8e, 0xe3, 0xb2, 0xdf, 0x68, 0x5, 0x1b, 0x76, 0xc1, 0xac, 0x84, 0xe9, 0x5e, 0x33, 0x2d, 0x40, 0xf7, 0x9a, 0xcb, 0xa6, 0x11, 0x7c, 0x62, 0xf, 0xb8, 0xd5, 0x1a, 0x77, 0xc0, 0xad, 0xb3, 0xde, 0x69, 0x4, 0x55, 0x38, 0x8f, 0xe2, 0xfc, 0x91, 0x26, 0x4b, 0xa5, 0xc8, 0x7f, 0x12, 0xc, 0x61, 0xd6, 0xbb, 0xea, 0x87, 0x30, 0x5d, 0x43, 0x2e, 0x99, 0xf4, 0x3b, 0x56, 0xe1, 0x8c, 0x92, 0xff, 0x48, 0x25, 0x74, 0x19, 0xae, 0xc3, 0xdd, 0xb0, 0x7, 0x6a, 0xc6, 0xab, 0x1c, 0x71, 0x6f, 0x2, 0xb5, 0xd8, 0x89, 0xe4, 0x53, 0x3e, 0x20, 0x4d, 0xfa, 0x97, 0x58, 0x35, 0x82, 0xef, 0xf1, 0x9c, 0x2b, 0x46, 0x17, 0x7a, 0xcd, 0xa0, 0xbe, 0xd3, 0x64, 0x9, 0xe7, 0x8a, 0x3d, 0x50, 0x4e, 0x23, 0x94, 0xf9, 0xa8, 0xc5, 0x72, 0x1f, 0x1, 0x6c, 0xdb, 0xb6, 0x79, 0x14, 0xa3, 0xce, 0xd0, 0xbd, 0xa, 0x67, 0x36, 0x5b, 0xec, 0x81, 0x9f, 0xf2, 0x45, 0x28},
+ {0x0, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40, 0xae, 0xc0, 0x72, 0x1c, 0xb, 0x65, 0xd7, 0xb9, 0xf9, 0x97, 0x25, 0x4b, 0x5c, 0x32, 0x80, 0xee, 0x41, 0x2f, 0x9d, 0xf3, 0xe4, 0x8a, 0x38, 0x56, 0x16, 0x78, 0xca, 0xa4, 0xb3, 0xdd, 0x6f, 0x1, 0xef, 0x81, 0x33, 0x5d, 0x4a, 0x24, 0x96, 0xf8, 0xb8, 0xd6, 0x64, 0xa, 0x1d, 0x73, 0xc1, 0xaf, 0x82, 0xec, 0x5e, 0x30, 0x27, 0x49, 0xfb, 0x95, 0xd5, 0xbb, 0x9, 0x67, 0x70, 0x1e, 0xac, 0xc2, 0x2c, 0x42, 0xf0, 0x9e, 0x89, 0xe7, 0x55, 0x3b, 0x7b, 0x15, 0xa7, 0xc9, 0xde, 0xb0, 0x2, 0x6c, 0xc3, 0xad, 0x1f, 0x71, 0x66, 0x8, 0xba, 0xd4, 0x94, 0xfa, 0x48, 0x26, 0x31, 0x5f, 0xed, 0x83, 0x6d, 0x3, 0xb1, 0xdf, 0xc8, 0xa6, 0x14, 0x7a, 0x3a, 0x54, 0xe6, 0x88, 0x9f, 0xf1, 0x43, 0x2d, 0x19, 0x77, 0xc5, 0xab, 0xbc, 0xd2, 0x60, 0xe, 0x4e, 0x20, 0x92, 0xfc, 0xeb, 0x85, 0x37, 0x59, 0xb7, 0xd9, 0x6b, 0x5, 0x12, 0x7c, 0xce, 0xa0, 0xe0, 0x8e, 0x3c, 0x52, 0x45, 0x2b, 0x99, 0xf7, 0x58, 0x36, 0x84, 0xea, 0xfd, 0x93, 0x21, 0x4f, 0xf, 0x61, 0xd3, 0xbd, 0xaa, 0xc4, 0x76, 0x18, 0xf6, 0x98, 0x2a, 0x44, 0x53, 0x3d, 0x8f, 0xe1, 0xa1, 0xcf, 0x7d, 0x13, 0x4, 0x6a, 0xd8, 0xb6, 0x9b, 0xf5, 0x47, 0x29, 0x3e, 0x50, 0xe2, 0x8c, 0xcc, 0xa2, 0x10, 0x7e, 0x69, 0x7, 0xb5, 0xdb, 0x35, 0x5b, 0xe9, 0x87, 0x90, 0xfe, 0x4c, 0x22, 0x62, 0xc, 0xbe, 0xd0, 0xc7, 0xa9, 0x1b, 0x75, 0xda, 0xb4, 0x6, 0x68, 0x7f, 0x11, 0xa3, 0xcd, 0x8d, 0xe3, 0x51, 0x3f, 0x28, 0x46, 0xf4, 0x9a, 0x74, 0x1a, 0xa8, 0xc6, 0xd1, 0xbf, 0xd, 0x63, 0x23, 0x4d, 0xff, 0x91, 0x86, 0xe8, 0x5a, 0x34},
+ {0x0, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f, 0xbe, 0xd1, 0x60, 0xf, 0x1f, 0x70, 0xc1, 0xae, 0xe1, 0x8e, 0x3f, 0x50, 0x40, 0x2f, 0x9e, 0xf1, 0x61, 0xe, 0xbf, 0xd0, 0xc0, 0xaf, 0x1e, 0x71, 0x3e, 0x51, 0xe0, 0x8f, 0x9f, 0xf0, 0x41, 0x2e, 0xdf, 0xb0, 0x1, 0x6e, 0x7e, 0x11, 0xa0, 0xcf, 0x80, 0xef, 0x5e, 0x31, 0x21, 0x4e, 0xff, 0x90, 0xc2, 0xad, 0x1c, 0x73, 0x63, 0xc, 0xbd, 0xd2, 0x9d, 0xf2, 0x43, 0x2c, 0x3c, 0x53, 0xe2, 0x8d, 0x7c, 0x13, 0xa2, 0xcd, 0xdd, 0xb2, 0x3, 0x6c, 0x23, 0x4c, 0xfd, 0x92, 0x82, 0xed, 0x5c, 0x33, 0xa3, 0xcc, 0x7d, 0x12, 0x2, 0x6d, 0xdc, 0xb3, 0xfc, 0x93, 0x22, 0x4d, 0x5d, 0x32, 0x83, 0xec, 0x1d, 0x72, 0xc3, 0xac, 0xbc, 0xd3, 0x62, 0xd, 0x42, 0x2d, 0x9c, 0xf3, 0xe3, 0x8c, 0x3d, 0x52, 0x99, 0xf6, 0x47, 0x28, 0x38, 0x57, 0xe6, 0x89, 0xc6, 0xa9, 0x18, 0x77, 0x67, 0x8, 0xb9, 0xd6, 0x27, 0x48, 0xf9, 0x96, 0x86, 0xe9, 0x58, 0x37, 0x78, 0x17, 0xa6, 0xc9, 0xd9, 0xb6, 0x7, 0x68, 0xf8, 0x97, 0x26, 0x49, 0x59, 0x36, 0x87, 0xe8, 0xa7, 0xc8, 0x79, 0x16, 0x6, 0x69, 0xd8, 0xb7, 0x46, 0x29, 0x98, 0xf7, 0xe7, 0x88, 0x39, 0x56, 0x19, 0x76, 0xc7, 0xa8, 0xb8, 0xd7, 0x66, 0x9, 0x5b, 0x34, 0x85, 0xea, 0xfa, 0x95, 0x24, 0x4b, 0x4, 0x6b, 0xda, 0xb5, 0xa5, 0xca, 0x7b, 0x14, 0xe5, 0x8a, 0x3b, 0x54, 0x44, 0x2b, 0x9a, 0xf5, 0xba, 0xd5, 0x64, 0xb, 0x1b, 0x74, 0xc5, 0xaa, 0x3a, 0x55, 0xe4, 0x8b, 0x9b, 0xf4, 0x45, 0x2a, 0x65, 0xa, 0xbb, 0xd4, 0xc4, 0xab, 0x1a, 0x75, 0x84, 0xeb, 0x5a, 0x35, 0x25, 0x4a, 0xfb, 0x94, 0xdb, 0xb4, 0x5, 0x6a, 0x7a, 0x15, 0xa4, 0xcb},
+ {0x0, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0xa, 0x9a, 0xea, 0x53, 0x23, 0xb3, 0xc3, 0x8e, 0xfe, 0x6e, 0x1e, 0xf4, 0x84, 0x14, 0x64, 0x29, 0x59, 0xc9, 0xb9, 0xa6, 0xd6, 0x46, 0x36, 0x7b, 0xb, 0x9b, 0xeb, 0x1, 0x71, 0xe1, 0x91, 0xdc, 0xac, 0x3c, 0x4c, 0xf5, 0x85, 0x15, 0x65, 0x28, 0x58, 0xc8, 0xb8, 0x52, 0x22, 0xb2, 0xc2, 0x8f, 0xff, 0x6f, 0x1f, 0x51, 0x21, 0xb1, 0xc1, 0x8c, 0xfc, 0x6c, 0x1c, 0xf6, 0x86, 0x16, 0x66, 0x2b, 0x5b, 0xcb, 0xbb, 0x2, 0x72, 0xe2, 0x92, 0xdf, 0xaf, 0x3f, 0x4f, 0xa5, 0xd5, 0x45, 0x35, 0x78, 0x8, 0x98, 0xe8, 0xf7, 0x87, 0x17, 0x67, 0x2a, 0x5a, 0xca, 0xba, 0x50, 0x20, 0xb0, 0xc0, 0x8d, 0xfd, 0x6d, 0x1d, 0xa4, 0xd4, 0x44, 0x34, 0x79, 0x9, 0x99, 0xe9, 0x3, 0x73, 0xe3, 0x93, 0xde, 0xae, 0x3e, 0x4e, 0xa2, 0xd2, 0x42, 0x32, 0x7f, 0xf, 0x9f, 0xef, 0x5, 0x75, 0xe5, 0x95, 0xd8, 0xa8, 0x38, 0x48, 0xf1, 0x81, 0x11, 0x61, 0x2c, 0x5c, 0xcc, 0xbc, 0x56, 0x26, 0xb6, 0xc6, 0x8b, 0xfb, 0x6b, 0x1b, 0x4, 0x74, 0xe4, 0x94, 0xd9, 0xa9, 0x39, 0x49, 0xa3, 0xd3, 0x43, 0x33, 0x7e, 0xe, 0x9e, 0xee, 0x57, 0x27, 0xb7, 0xc7, 0x8a, 0xfa, 0x6a, 0x1a, 0xf0, 0x80, 0x10, 0x60, 0x2d, 0x5d, 0xcd, 0xbd, 0xf3, 0x83, 0x13, 0x63, 0x2e, 0x5e, 0xce, 0xbe, 0x54, 0x24, 0xb4, 0xc4, 0x89, 0xf9, 0x69, 0x19, 0xa0, 0xd0, 0x40, 0x30, 0x7d, 0xd, 0x9d, 0xed, 0x7, 0x77, 0xe7, 0x97, 0xda, 0xaa, 0x3a, 0x4a, 0x55, 0x25, 0xb5, 0xc5, 0x88, 0xf8, 0x68, 0x18, 0xf2, 0x82, 0x12, 0x62, 0x2f, 0x5f, 0xcf, 0xbf, 0x6, 0x76, 0xe6, 0x96, 0xdb, 0xab, 0x3b, 0x4b, 0xa1, 0xd1, 0x41, 0x31, 0x7c, 0xc, 0x9c, 0xec},
+ {0x0, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x7, 0x94, 0xe5, 0x43, 0x32, 0xa1, 0xd0, 0x9a, 0xeb, 0x78, 0x9, 0xec, 0x9d, 0xe, 0x7f, 0x35, 0x44, 0xd7, 0xa6, 0x86, 0xf7, 0x64, 0x15, 0x5f, 0x2e, 0xbd, 0xcc, 0x29, 0x58, 0xcb, 0xba, 0xf0, 0x81, 0x12, 0x63, 0xc5, 0xb4, 0x27, 0x56, 0x1c, 0x6d, 0xfe, 0x8f, 0x6a, 0x1b, 0x88, 0xf9, 0xb3, 0xc2, 0x51, 0x20, 0x11, 0x60, 0xf3, 0x82, 0xc8, 0xb9, 0x2a, 0x5b, 0xbe, 0xcf, 0x5c, 0x2d, 0x67, 0x16, 0x85, 0xf4, 0x52, 0x23, 0xb0, 0xc1, 0x8b, 0xfa, 0x69, 0x18, 0xfd, 0x8c, 0x1f, 0x6e, 0x24, 0x55, 0xc6, 0xb7, 0x97, 0xe6, 0x75, 0x4, 0x4e, 0x3f, 0xac, 0xdd, 0x38, 0x49, 0xda, 0xab, 0xe1, 0x90, 0x3, 0x72, 0xd4, 0xa5, 0x36, 0x47, 0xd, 0x7c, 0xef, 0x9e, 0x7b, 0xa, 0x99, 0xe8, 0xa2, 0xd3, 0x40, 0x31, 0x22, 0x53, 0xc0, 0xb1, 0xfb, 0x8a, 0x19, 0x68, 0x8d, 0xfc, 0x6f, 0x1e, 0x54, 0x25, 0xb6, 0xc7, 0x61, 0x10, 0x83, 0xf2, 0xb8, 0xc9, 0x5a, 0x2b, 0xce, 0xbf, 0x2c, 0x5d, 0x17, 0x66, 0xf5, 0x84, 0xa4, 0xd5, 0x46, 0x37, 0x7d, 0xc, 0x9f, 0xee, 0xb, 0x7a, 0xe9, 0x98, 0xd2, 0xa3, 0x30, 0x41, 0xe7, 0x96, 0x5, 0x74, 0x3e, 0x4f, 0xdc, 0xad, 0x48, 0x39, 0xaa, 0xdb, 0x91, 0xe0, 0x73, 0x2, 0x33, 0x42, 0xd1, 0xa0, 0xea, 0x9b, 0x8, 0x79, 0x9c, 0xed, 0x7e, 0xf, 0x45, 0x34, 0xa7, 0xd6, 0x70, 0x1, 0x92, 0xe3, 0xa9, 0xd8, 0x4b, 0x3a, 0xdf, 0xae, 0x3d, 0x4c, 0x6, 0x77, 0xe4, 0x95, 0xb5, 0xc4, 0x57, 0x26, 0x6c, 0x1d, 0x8e, 0xff, 0x1a, 0x6b, 0xf8, 0x89, 0xc3, 0xb2, 0x21, 0x50, 0xf6, 0x87, 0x14, 0x65, 0x2f, 0x5e, 0xcd, 0xbc, 0x59, 0x28, 0xbb, 0xca, 0x80, 0xf1, 0x62, 0x13},
+ {0x0, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4, 0x73, 0x1, 0x97, 0xe5, 0xa6, 0xd4, 0x42, 0x30, 0xc4, 0xb6, 0x20, 0x52, 0x11, 0x63, 0xf5, 0x87, 0xe6, 0x94, 0x2, 0x70, 0x33, 0x41, 0xd7, 0xa5, 0x51, 0x23, 0xb5, 0xc7, 0x84, 0xf6, 0x60, 0x12, 0x95, 0xe7, 0x71, 0x3, 0x40, 0x32, 0xa4, 0xd6, 0x22, 0x50, 0xc6, 0xb4, 0xf7, 0x85, 0x13, 0x61, 0xd1, 0xa3, 0x35, 0x47, 0x4, 0x76, 0xe0, 0x92, 0x66, 0x14, 0x82, 0xf0, 0xb3, 0xc1, 0x57, 0x25, 0xa2, 0xd0, 0x46, 0x34, 0x77, 0x5, 0x93, 0xe1, 0x15, 0x67, 0xf1, 0x83, 0xc0, 0xb2, 0x24, 0x56, 0x37, 0x45, 0xd3, 0xa1, 0xe2, 0x90, 0x6, 0x74, 0x80, 0xf2, 0x64, 0x16, 0x55, 0x27, 0xb1, 0xc3, 0x44, 0x36, 0xa0, 0xd2, 0x91, 0xe3, 0x75, 0x7, 0xf3, 0x81, 0x17, 0x65, 0x26, 0x54, 0xc2, 0xb0, 0xbf, 0xcd, 0x5b, 0x29, 0x6a, 0x18, 0x8e, 0xfc, 0x8, 0x7a, 0xec, 0x9e, 0xdd, 0xaf, 0x39, 0x4b, 0xcc, 0xbe, 0x28, 0x5a, 0x19, 0x6b, 0xfd, 0x8f, 0x7b, 0x9, 0x9f, 0xed, 0xae, 0xdc, 0x4a, 0x38, 0x59, 0x2b, 0xbd, 0xcf, 0x8c, 0xfe, 0x68, 0x1a, 0xee, 0x9c, 0xa, 0x78, 0x3b, 0x49, 0xdf, 0xad, 0x2a, 0x58, 0xce, 0xbc, 0xff, 0x8d, 0x1b, 0x69, 0x9d, 0xef, 0x79, 0xb, 0x48, 0x3a, 0xac, 0xde, 0x6e, 0x1c, 0x8a, 0xf8, 0xbb, 0xc9, 0x5f, 0x2d, 0xd9, 0xab, 0x3d, 0x4f, 0xc, 0x7e, 0xe8, 0x9a, 0x1d, 0x6f, 0xf9, 0x8b, 0xc8, 0xba, 0x2c, 0x5e, 0xaa, 0xd8, 0x4e, 0x3c, 0x7f, 0xd, 0x9b, 0xe9, 0x88, 0xfa, 0x6c, 0x1e, 0x5d, 0x2f, 0xb9, 0xcb, 0x3f, 0x4d, 0xdb, 0xa9, 0xea, 0x98, 0xe, 0x7c, 0xfb, 0x89, 0x1f, 0x6d, 0x2e, 0x5c, 0xca, 0xb8, 0x4c, 0x3e, 0xa8, 0xda, 0x99, 0xeb, 0x7d, 0xf},
+ {0x0, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb, 0x63, 0x10, 0x85, 0xf6, 0xb2, 0xc1, 0x54, 0x27, 0xdc, 0xaf, 0x3a, 0x49, 0xd, 0x7e, 0xeb, 0x98, 0xc6, 0xb5, 0x20, 0x53, 0x17, 0x64, 0xf1, 0x82, 0x79, 0xa, 0x9f, 0xec, 0xa8, 0xdb, 0x4e, 0x3d, 0xa5, 0xd6, 0x43, 0x30, 0x74, 0x7, 0x92, 0xe1, 0x1a, 0x69, 0xfc, 0x8f, 0xcb, 0xb8, 0x2d, 0x5e, 0x91, 0xe2, 0x77, 0x4, 0x40, 0x33, 0xa6, 0xd5, 0x2e, 0x5d, 0xc8, 0xbb, 0xff, 0x8c, 0x19, 0x6a, 0xf2, 0x81, 0x14, 0x67, 0x23, 0x50, 0xc5, 0xb6, 0x4d, 0x3e, 0xab, 0xd8, 0x9c, 0xef, 0x7a, 0x9, 0x57, 0x24, 0xb1, 0xc2, 0x86, 0xf5, 0x60, 0x13, 0xe8, 0x9b, 0xe, 0x7d, 0x39, 0x4a, 0xdf, 0xac, 0x34, 0x47, 0xd2, 0xa1, 0xe5, 0x96, 0x3, 0x70, 0x8b, 0xf8, 0x6d, 0x1e, 0x5a, 0x29, 0xbc, 0xcf, 0x3f, 0x4c, 0xd9, 0xaa, 0xee, 0x9d, 0x8, 0x7b, 0x80, 0xf3, 0x66, 0x15, 0x51, 0x22, 0xb7, 0xc4, 0x5c, 0x2f, 0xba, 0xc9, 0x8d, 0xfe, 0x6b, 0x18, 0xe3, 0x90, 0x5, 0x76, 0x32, 0x41, 0xd4, 0xa7, 0xf9, 0x8a, 0x1f, 0x6c, 0x28, 0x5b, 0xce, 0xbd, 0x46, 0x35, 0xa0, 0xd3, 0x97, 0xe4, 0x71, 0x2, 0x9a, 0xe9, 0x7c, 0xf, 0x4b, 0x38, 0xad, 0xde, 0x25, 0x56, 0xc3, 0xb0, 0xf4, 0x87, 0x12, 0x61, 0xae, 0xdd, 0x48, 0x3b, 0x7f, 0xc, 0x99, 0xea, 0x11, 0x62, 0xf7, 0x84, 0xc0, 0xb3, 0x26, 0x55, 0xcd, 0xbe, 0x2b, 0x58, 0x1c, 0x6f, 0xfa, 0x89, 0x72, 0x1, 0x94, 0xe7, 0xa3, 0xd0, 0x45, 0x36, 0x68, 0x1b, 0x8e, 0xfd, 0xb9, 0xca, 0x5f, 0x2c, 0xd7, 0xa4, 0x31, 0x42, 0x6, 0x75, 0xe0, 0x93, 0xb, 0x78, 0xed, 0x9e, 0xda, 0xa9, 0x3c, 0x4f, 0xb4, 0xc7, 0x52, 0x21, 0x65, 0x16, 0x83, 0xf0},
+ {0x0, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6, 0x13, 0x67, 0xfb, 0x8f, 0xde, 0xaa, 0x36, 0x42, 0x94, 0xe0, 0x7c, 0x8, 0x59, 0x2d, 0xb1, 0xc5, 0x26, 0x52, 0xce, 0xba, 0xeb, 0x9f, 0x3, 0x77, 0xa1, 0xd5, 0x49, 0x3d, 0x6c, 0x18, 0x84, 0xf0, 0x35, 0x41, 0xdd, 0xa9, 0xf8, 0x8c, 0x10, 0x64, 0xb2, 0xc6, 0x5a, 0x2e, 0x7f, 0xb, 0x97, 0xe3, 0x4c, 0x38, 0xa4, 0xd0, 0x81, 0xf5, 0x69, 0x1d, 0xcb, 0xbf, 0x23, 0x57, 0x6, 0x72, 0xee, 0x9a, 0x5f, 0x2b, 0xb7, 0xc3, 0x92, 0xe6, 0x7a, 0xe, 0xd8, 0xac, 0x30, 0x44, 0x15, 0x61, 0xfd, 0x89, 0x6a, 0x1e, 0x82, 0xf6, 0xa7, 0xd3, 0x4f, 0x3b, 0xed, 0x99, 0x5, 0x71, 0x20, 0x54, 0xc8, 0xbc, 0x79, 0xd, 0x91, 0xe5, 0xb4, 0xc0, 0x5c, 0x28, 0xfe, 0x8a, 0x16, 0x62, 0x33, 0x47, 0xdb, 0xaf, 0x98, 0xec, 0x70, 0x4, 0x55, 0x21, 0xbd, 0xc9, 0x1f, 0x6b, 0xf7, 0x83, 0xd2, 0xa6, 0x3a, 0x4e, 0x8b, 0xff, 0x63, 0x17, 0x46, 0x32, 0xae, 0xda, 0xc, 0x78, 0xe4, 0x90, 0xc1, 0xb5, 0x29, 0x5d, 0xbe, 0xca, 0x56, 0x22, 0x73, 0x7, 0x9b, 0xef, 0x39, 0x4d, 0xd1, 0xa5, 0xf4, 0x80, 0x1c, 0x68, 0xad, 0xd9, 0x45, 0x31, 0x60, 0x14, 0x88, 0xfc, 0x2a, 0x5e, 0xc2, 0xb6, 0xe7, 0x93, 0xf, 0x7b, 0xd4, 0xa0, 0x3c, 0x48, 0x19, 0x6d, 0xf1, 0x85, 0x53, 0x27, 0xbb, 0xcf, 0x9e, 0xea, 0x76, 0x2, 0xc7, 0xb3, 0x2f, 0x5b, 0xa, 0x7e, 0xe2, 0x96, 0x40, 0x34, 0xa8, 0xdc, 0x8d, 0xf9, 0x65, 0x11, 0xf2, 0x86, 0x1a, 0x6e, 0x3f, 0x4b, 0xd7, 0xa3, 0x75, 0x1, 0x9d, 0xe9, 0xb8, 0xcc, 0x50, 0x24, 0xe1, 0x95, 0x9, 0x7d, 0x2c, 0x58, 0xc4, 0xb0, 0x66, 0x12, 0x8e, 0xfa, 0xab, 0xdf, 0x43, 0x37},
+ {0x0, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9, 0x3, 0x76, 0xe9, 0x9c, 0xca, 0xbf, 0x20, 0x55, 0x8c, 0xf9, 0x66, 0x13, 0x45, 0x30, 0xaf, 0xda, 0x6, 0x73, 0xec, 0x99, 0xcf, 0xba, 0x25, 0x50, 0x89, 0xfc, 0x63, 0x16, 0x40, 0x35, 0xaa, 0xdf, 0x5, 0x70, 0xef, 0x9a, 0xcc, 0xb9, 0x26, 0x53, 0x8a, 0xff, 0x60, 0x15, 0x43, 0x36, 0xa9, 0xdc, 0xc, 0x79, 0xe6, 0x93, 0xc5, 0xb0, 0x2f, 0x5a, 0x83, 0xf6, 0x69, 0x1c, 0x4a, 0x3f, 0xa0, 0xd5, 0xf, 0x7a, 0xe5, 0x90, 0xc6, 0xb3, 0x2c, 0x59, 0x80, 0xf5, 0x6a, 0x1f, 0x49, 0x3c, 0xa3, 0xd6, 0xa, 0x7f, 0xe0, 0x95, 0xc3, 0xb6, 0x29, 0x5c, 0x85, 0xf0, 0x6f, 0x1a, 0x4c, 0x39, 0xa6, 0xd3, 0x9, 0x7c, 0xe3, 0x96, 0xc0, 0xb5, 0x2a, 0x5f, 0x86, 0xf3, 0x6c, 0x19, 0x4f, 0x3a, 0xa5, 0xd0, 0x18, 0x6d, 0xf2, 0x87, 0xd1, 0xa4, 0x3b, 0x4e, 0x97, 0xe2, 0x7d, 0x8, 0x5e, 0x2b, 0xb4, 0xc1, 0x1b, 0x6e, 0xf1, 0x84, 0xd2, 0xa7, 0x38, 0x4d, 0x94, 0xe1, 0x7e, 0xb, 0x5d, 0x28, 0xb7, 0xc2, 0x1e, 0x6b, 0xf4, 0x81, 0xd7, 0xa2, 0x3d, 0x48, 0x91, 0xe4, 0x7b, 0xe, 0x58, 0x2d, 0xb2, 0xc7, 0x1d, 0x68, 0xf7, 0x82, 0xd4, 0xa1, 0x3e, 0x4b, 0x92, 0xe7, 0x78, 0xd, 0x5b, 0x2e, 0xb1, 0xc4, 0x14, 0x61, 0xfe, 0x8b, 0xdd, 0xa8, 0x37, 0x42, 0x9b, 0xee, 0x71, 0x4, 0x52, 0x27, 0xb8, 0xcd, 0x17, 0x62, 0xfd, 0x88, 0xde, 0xab, 0x34, 0x41, 0x98, 0xed, 0x72, 0x7, 0x51, 0x24, 0xbb, 0xce, 0x12, 0x67, 0xf8, 0x8d, 0xdb, 0xae, 0x31, 0x44, 0x9d, 0xe8, 0x77, 0x2, 0x54, 0x21, 0xbe, 0xcb, 0x11, 0x64, 0xfb, 0x8e, 0xd8, 0xad, 0x32, 0x47, 0x9e, 0xeb, 0x74, 0x1, 0x57, 0x22, 0xbd, 0xc8},
+ {0x0, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0xd, 0x52, 0x24, 0xbe, 0xc8, 0x33, 0x45, 0xdf, 0xa9, 0xf6, 0x80, 0x1a, 0x6c, 0xa4, 0xd2, 0x48, 0x3e, 0x61, 0x17, 0x8d, 0xfb, 0x66, 0x10, 0x8a, 0xfc, 0xa3, 0xd5, 0x4f, 0x39, 0xf1, 0x87, 0x1d, 0x6b, 0x34, 0x42, 0xd8, 0xae, 0x55, 0x23, 0xb9, 0xcf, 0x90, 0xe6, 0x7c, 0xa, 0xc2, 0xb4, 0x2e, 0x58, 0x7, 0x71, 0xeb, 0x9d, 0xcc, 0xba, 0x20, 0x56, 0x9, 0x7f, 0xe5, 0x93, 0x5b, 0x2d, 0xb7, 0xc1, 0x9e, 0xe8, 0x72, 0x4, 0xff, 0x89, 0x13, 0x65, 0x3a, 0x4c, 0xd6, 0xa0, 0x68, 0x1e, 0x84, 0xf2, 0xad, 0xdb, 0x41, 0x37, 0xaa, 0xdc, 0x46, 0x30, 0x6f, 0x19, 0x83, 0xf5, 0x3d, 0x4b, 0xd1, 0xa7, 0xf8, 0x8e, 0x14, 0x62, 0x99, 0xef, 0x75, 0x3, 0x5c, 0x2a, 0xb0, 0xc6, 0xe, 0x78, 0xe2, 0x94, 0xcb, 0xbd, 0x27, 0x51, 0x85, 0xf3, 0x69, 0x1f, 0x40, 0x36, 0xac, 0xda, 0x12, 0x64, 0xfe, 0x88, 0xd7, 0xa1, 0x3b, 0x4d, 0xb6, 0xc0, 0x5a, 0x2c, 0x73, 0x5, 0x9f, 0xe9, 0x21, 0x57, 0xcd, 0xbb, 0xe4, 0x92, 0x8, 0x7e, 0xe3, 0x95, 0xf, 0x79, 0x26, 0x50, 0xca, 0xbc, 0x74, 0x2, 0x98, 0xee, 0xb1, 0xc7, 0x5d, 0x2b, 0xd0, 0xa6, 0x3c, 0x4a, 0x15, 0x63, 0xf9, 0x8f, 0x47, 0x31, 0xab, 0xdd, 0x82, 0xf4, 0x6e, 0x18, 0x49, 0x3f, 0xa5, 0xd3, 0x8c, 0xfa, 0x60, 0x16, 0xde, 0xa8, 0x32, 0x44, 0x1b, 0x6d, 0xf7, 0x81, 0x7a, 0xc, 0x96, 0xe0, 0xbf, 0xc9, 0x53, 0x25, 0xed, 0x9b, 0x1, 0x77, 0x28, 0x5e, 0xc4, 0xb2, 0x2f, 0x59, 0xc3, 0xb5, 0xea, 0x9c, 0x6, 0x70, 0xb8, 0xce, 0x54, 0x22, 0x7d, 0xb, 0x91, 0xe7, 0x1c, 0x6a, 0xf0, 0x86, 0xd9, 0xaf, 0x35, 0x43, 0x8b, 0xfd, 0x67, 0x11, 0x4e, 0x38, 0xa2, 0xd4},
+ {0x0, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x6, 0x5e, 0x29, 0xb0, 0xc7, 0x23, 0x54, 0xcd, 0xba, 0xe2, 0x95, 0xc, 0x7b, 0xbc, 0xcb, 0x52, 0x25, 0x7d, 0xa, 0x93, 0xe4, 0x46, 0x31, 0xa8, 0xdf, 0x87, 0xf0, 0x69, 0x1e, 0xd9, 0xae, 0x37, 0x40, 0x18, 0x6f, 0xf6, 0x81, 0x65, 0x12, 0x8b, 0xfc, 0xa4, 0xd3, 0x4a, 0x3d, 0xfa, 0x8d, 0x14, 0x63, 0x3b, 0x4c, 0xd5, 0xa2, 0x8c, 0xfb, 0x62, 0x15, 0x4d, 0x3a, 0xa3, 0xd4, 0x13, 0x64, 0xfd, 0x8a, 0xd2, 0xa5, 0x3c, 0x4b, 0xaf, 0xd8, 0x41, 0x36, 0x6e, 0x19, 0x80, 0xf7, 0x30, 0x47, 0xde, 0xa9, 0xf1, 0x86, 0x1f, 0x68, 0xca, 0xbd, 0x24, 0x53, 0xb, 0x7c, 0xe5, 0x92, 0x55, 0x22, 0xbb, 0xcc, 0x94, 0xe3, 0x7a, 0xd, 0xe9, 0x9e, 0x7, 0x70, 0x28, 0x5f, 0xc6, 0xb1, 0x76, 0x1, 0x98, 0xef, 0xb7, 0xc0, 0x59, 0x2e, 0x5, 0x72, 0xeb, 0x9c, 0xc4, 0xb3, 0x2a, 0x5d, 0x9a, 0xed, 0x74, 0x3, 0x5b, 0x2c, 0xb5, 0xc2, 0x26, 0x51, 0xc8, 0xbf, 0xe7, 0x90, 0x9, 0x7e, 0xb9, 0xce, 0x57, 0x20, 0x78, 0xf, 0x96, 0xe1, 0x43, 0x34, 0xad, 0xda, 0x82, 0xf5, 0x6c, 0x1b, 0xdc, 0xab, 0x32, 0x45, 0x1d, 0x6a, 0xf3, 0x84, 0x60, 0x17, 0x8e, 0xf9, 0xa1, 0xd6, 0x4f, 0x38, 0xff, 0x88, 0x11, 0x66, 0x3e, 0x49, 0xd0, 0xa7, 0x89, 0xfe, 0x67, 0x10, 0x48, 0x3f, 0xa6, 0xd1, 0x16, 0x61, 0xf8, 0x8f, 0xd7, 0xa0, 0x39, 0x4e, 0xaa, 0xdd, 0x44, 0x33, 0x6b, 0x1c, 0x85, 0xf2, 0x35, 0x42, 0xdb, 0xac, 0xf4, 0x83, 0x1a, 0x6d, 0xcf, 0xb8, 0x21, 0x56, 0xe, 0x79, 0xe0, 0x97, 0x50, 0x27, 0xbe, 0xc9, 0x91, 0xe6, 0x7f, 0x8, 0xec, 0x9b, 0x2, 0x75, 0x2d, 0x5a, 0xc3, 0xb4, 0x73, 0x4, 0x9d, 0xea, 0xb2, 0xc5, 0x5c, 0x2b},
+ {0x0, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0xd, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92, 0xd3, 0xab, 0x23, 0x5b, 0x2e, 0x56, 0xde, 0xa6, 0x34, 0x4c, 0xc4, 0xbc, 0xc9, 0xb1, 0x39, 0x41, 0xbb, 0xc3, 0x4b, 0x33, 0x46, 0x3e, 0xb6, 0xce, 0x5c, 0x24, 0xac, 0xd4, 0xa1, 0xd9, 0x51, 0x29, 0x68, 0x10, 0x98, 0xe0, 0x95, 0xed, 0x65, 0x1d, 0x8f, 0xf7, 0x7f, 0x7, 0x72, 0xa, 0x82, 0xfa, 0x6b, 0x13, 0x9b, 0xe3, 0x96, 0xee, 0x66, 0x1e, 0x8c, 0xf4, 0x7c, 0x4, 0x71, 0x9, 0x81, 0xf9, 0xb8, 0xc0, 0x48, 0x30, 0x45, 0x3d, 0xb5, 0xcd, 0x5f, 0x27, 0xaf, 0xd7, 0xa2, 0xda, 0x52, 0x2a, 0xd0, 0xa8, 0x20, 0x58, 0x2d, 0x55, 0xdd, 0xa5, 0x37, 0x4f, 0xc7, 0xbf, 0xca, 0xb2, 0x3a, 0x42, 0x3, 0x7b, 0xf3, 0x8b, 0xfe, 0x86, 0xe, 0x76, 0xe4, 0x9c, 0x14, 0x6c, 0x19, 0x61, 0xe9, 0x91, 0xd6, 0xae, 0x26, 0x5e, 0x2b, 0x53, 0xdb, 0xa3, 0x31, 0x49, 0xc1, 0xb9, 0xcc, 0xb4, 0x3c, 0x44, 0x5, 0x7d, 0xf5, 0x8d, 0xf8, 0x80, 0x8, 0x70, 0xe2, 0x9a, 0x12, 0x6a, 0x1f, 0x67, 0xef, 0x97, 0x6d, 0x15, 0x9d, 0xe5, 0x90, 0xe8, 0x60, 0x18, 0x8a, 0xf2, 0x7a, 0x2, 0x77, 0xf, 0x87, 0xff, 0xbe, 0xc6, 0x4e, 0x36, 0x43, 0x3b, 0xb3, 0xcb, 0x59, 0x21, 0xa9, 0xd1, 0xa4, 0xdc, 0x54, 0x2c, 0xbd, 0xc5, 0x4d, 0x35, 0x40, 0x38, 0xb0, 0xc8, 0x5a, 0x22, 0xaa, 0xd2, 0xa7, 0xdf, 0x57, 0x2f, 0x6e, 0x16, 0x9e, 0xe6, 0x93, 0xeb, 0x63, 0x1b, 0x89, 0xf1, 0x79, 0x1, 0x74, 0xc, 0x84, 0xfc, 0x6, 0x7e, 0xf6, 0x8e, 0xfb, 0x83, 0xb, 0x73, 0xe1, 0x99, 0x11, 0x69, 0x1c, 0x64, 0xec, 0x94, 0xd5, 0xad, 0x25, 0x5d, 0x28, 0x50, 0xd8, 0xa0, 0x32, 0x4a, 0xc2, 0xba, 0xcf, 0xb7, 0x3f, 0x47},
+ {0x0, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0xb, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d, 0xc3, 0xba, 0x31, 0x48, 0x3a, 0x43, 0xc8, 0xb1, 0x2c, 0x55, 0xde, 0xa7, 0xd5, 0xac, 0x27, 0x5e, 0x9b, 0xe2, 0x69, 0x10, 0x62, 0x1b, 0x90, 0xe9, 0x74, 0xd, 0x86, 0xff, 0x8d, 0xf4, 0x7f, 0x6, 0x58, 0x21, 0xaa, 0xd3, 0xa1, 0xd8, 0x53, 0x2a, 0xb7, 0xce, 0x45, 0x3c, 0x4e, 0x37, 0xbc, 0xc5, 0x2b, 0x52, 0xd9, 0xa0, 0xd2, 0xab, 0x20, 0x59, 0xc4, 0xbd, 0x36, 0x4f, 0x3d, 0x44, 0xcf, 0xb6, 0xe8, 0x91, 0x1a, 0x63, 0x11, 0x68, 0xe3, 0x9a, 0x7, 0x7e, 0xf5, 0x8c, 0xfe, 0x87, 0xc, 0x75, 0xb0, 0xc9, 0x42, 0x3b, 0x49, 0x30, 0xbb, 0xc2, 0x5f, 0x26, 0xad, 0xd4, 0xa6, 0xdf, 0x54, 0x2d, 0x73, 0xa, 0x81, 0xf8, 0x8a, 0xf3, 0x78, 0x1, 0x9c, 0xe5, 0x6e, 0x17, 0x65, 0x1c, 0x97, 0xee, 0x56, 0x2f, 0xa4, 0xdd, 0xaf, 0xd6, 0x5d, 0x24, 0xb9, 0xc0, 0x4b, 0x32, 0x40, 0x39, 0xb2, 0xcb, 0x95, 0xec, 0x67, 0x1e, 0x6c, 0x15, 0x9e, 0xe7, 0x7a, 0x3, 0x88, 0xf1, 0x83, 0xfa, 0x71, 0x8, 0xcd, 0xb4, 0x3f, 0x46, 0x34, 0x4d, 0xc6, 0xbf, 0x22, 0x5b, 0xd0, 0xa9, 0xdb, 0xa2, 0x29, 0x50, 0xe, 0x77, 0xfc, 0x85, 0xf7, 0x8e, 0x5, 0x7c, 0xe1, 0x98, 0x13, 0x6a, 0x18, 0x61, 0xea, 0x93, 0x7d, 0x4, 0x8f, 0xf6, 0x84, 0xfd, 0x76, 0xf, 0x92, 0xeb, 0x60, 0x19, 0x6b, 0x12, 0x99, 0xe0, 0xbe, 0xc7, 0x4c, 0x35, 0x47, 0x3e, 0xb5, 0xcc, 0x51, 0x28, 0xa3, 0xda, 0xa8, 0xd1, 0x5a, 0x23, 0xe6, 0x9f, 0x14, 0x6d, 0x1f, 0x66, 0xed, 0x94, 0x9, 0x70, 0xfb, 0x82, 0xf0, 0x89, 0x2, 0x7b, 0x25, 0x5c, 0xd7, 0xae, 0xdc, 0xa5, 0x2e, 0x57, 0xca, 0xb3, 0x38, 0x41, 0x33, 0x4a, 0xc1, 0xb8},
+ {0x0, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x1, 0x7b, 0xf7, 0x8d, 0x3, 0x79, 0x2, 0x78, 0xf6, 0x8c, 0xf3, 0x89, 0x7, 0x7d, 0x6, 0x7c, 0xf2, 0x88, 0x4, 0x7e, 0xf0, 0x8a, 0xf1, 0x8b, 0x5, 0x7f, 0xfb, 0x81, 0xf, 0x75, 0xe, 0x74, 0xfa, 0x80, 0xc, 0x76, 0xf8, 0x82, 0xf9, 0x83, 0xd, 0x77, 0x8, 0x72, 0xfc, 0x86, 0xfd, 0x87, 0x9, 0x73, 0xff, 0x85, 0xb, 0x71, 0xa, 0x70, 0xfe, 0x84, 0xeb, 0x91, 0x1f, 0x65, 0x1e, 0x64, 0xea, 0x90, 0x1c, 0x66, 0xe8, 0x92, 0xe9, 0x93, 0x1d, 0x67, 0x18, 0x62, 0xec, 0x96, 0xed, 0x97, 0x19, 0x63, 0xef, 0x95, 0x1b, 0x61, 0x1a, 0x60, 0xee, 0x94, 0x10, 0x6a, 0xe4, 0x9e, 0xe5, 0x9f, 0x11, 0x6b, 0xe7, 0x9d, 0x13, 0x69, 0x12, 0x68, 0xe6, 0x9c, 0xe3, 0x99, 0x17, 0x6d, 0x16, 0x6c, 0xe2, 0x98, 0x14, 0x6e, 0xe0, 0x9a, 0xe1, 0x9b, 0x15, 0x6f, 0xcb, 0xb1, 0x3f, 0x45, 0x3e, 0x44, 0xca, 0xb0, 0x3c, 0x46, 0xc8, 0xb2, 0xc9, 0xb3, 0x3d, 0x47, 0x38, 0x42, 0xcc, 0xb6, 0xcd, 0xb7, 0x39, 0x43, 0xcf, 0xb5, 0x3b, 0x41, 0x3a, 0x40, 0xce, 0xb4, 0x30, 0x4a, 0xc4, 0xbe, 0xc5, 0xbf, 0x31, 0x4b, 0xc7, 0xbd, 0x33, 0x49, 0x32, 0x48, 0xc6, 0xbc, 0xc3, 0xb9, 0x37, 0x4d, 0x36, 0x4c, 0xc2, 0xb8, 0x34, 0x4e, 0xc0, 0xba, 0xc1, 0xbb, 0x35, 0x4f, 0x20, 0x5a, 0xd4, 0xae, 0xd5, 0xaf, 0x21, 0x5b, 0xd7, 0xad, 0x23, 0x59, 0x22, 0x58, 0xd6, 0xac, 0xd3, 0xa9, 0x27, 0x5d, 0x26, 0x5c, 0xd2, 0xa8, 0x24, 0x5e, 0xd0, 0xaa, 0xd1, 0xab, 0x25, 0x5f, 0xdb, 0xa1, 0x2f, 0x55, 0x2e, 0x54, 0xda, 0xa0, 0x2c, 0x56, 0xd8, 0xa2, 0xd9, 0xa3, 0x2d, 0x57, 0x28, 0x52, 0xdc, 0xa6, 0xdd, 0xa7, 0x29, 0x53, 0xdf, 0xa5, 0x2b, 0x51, 0x2a, 0x50, 0xde, 0xa4},
+ {0x0, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x7, 0x7c, 0xff, 0x84, 0x9, 0x72, 0xe, 0x75, 0xf8, 0x83, 0xe3, 0x98, 0x15, 0x6e, 0x12, 0x69, 0xe4, 0x9f, 0x1c, 0x67, 0xea, 0x91, 0xed, 0x96, 0x1b, 0x60, 0xdb, 0xa0, 0x2d, 0x56, 0x2a, 0x51, 0xdc, 0xa7, 0x24, 0x5f, 0xd2, 0xa9, 0xd5, 0xae, 0x23, 0x58, 0x38, 0x43, 0xce, 0xb5, 0xc9, 0xb2, 0x3f, 0x44, 0xc7, 0xbc, 0x31, 0x4a, 0x36, 0x4d, 0xc0, 0xbb, 0xab, 0xd0, 0x5d, 0x26, 0x5a, 0x21, 0xac, 0xd7, 0x54, 0x2f, 0xa2, 0xd9, 0xa5, 0xde, 0x53, 0x28, 0x48, 0x33, 0xbe, 0xc5, 0xb9, 0xc2, 0x4f, 0x34, 0xb7, 0xcc, 0x41, 0x3a, 0x46, 0x3d, 0xb0, 0xcb, 0x70, 0xb, 0x86, 0xfd, 0x81, 0xfa, 0x77, 0xc, 0x8f, 0xf4, 0x79, 0x2, 0x7e, 0x5, 0x88, 0xf3, 0x93, 0xe8, 0x65, 0x1e, 0x62, 0x19, 0x94, 0xef, 0x6c, 0x17, 0x9a, 0xe1, 0x9d, 0xe6, 0x6b, 0x10, 0x4b, 0x30, 0xbd, 0xc6, 0xba, 0xc1, 0x4c, 0x37, 0xb4, 0xcf, 0x42, 0x39, 0x45, 0x3e, 0xb3, 0xc8, 0xa8, 0xd3, 0x5e, 0x25, 0x59, 0x22, 0xaf, 0xd4, 0x57, 0x2c, 0xa1, 0xda, 0xa6, 0xdd, 0x50, 0x2b, 0x90, 0xeb, 0x66, 0x1d, 0x61, 0x1a, 0x97, 0xec, 0x6f, 0x14, 0x99, 0xe2, 0x9e, 0xe5, 0x68, 0x13, 0x73, 0x8, 0x85, 0xfe, 0x82, 0xf9, 0x74, 0xf, 0x8c, 0xf7, 0x7a, 0x1, 0x7d, 0x6, 0x8b, 0xf0, 0xe0, 0x9b, 0x16, 0x6d, 0x11, 0x6a, 0xe7, 0x9c, 0x1f, 0x64, 0xe9, 0x92, 0xee, 0x95, 0x18, 0x63, 0x3, 0x78, 0xf5, 0x8e, 0xf2, 0x89, 0x4, 0x7f, 0xfc, 0x87, 0xa, 0x71, 0xd, 0x76, 0xfb, 0x80, 0x3b, 0x40, 0xcd, 0xb6, 0xca, 0xb1, 0x3c, 0x47, 0xc4, 0xbf, 0x32, 0x49, 0x35, 0x4e, 0xc3, 0xb8, 0xd8, 0xa3, 0x2e, 0x55, 0x29, 0x52, 0xdf, 0xa4, 0x27, 0x5c, 0xd1, 0xaa, 0xd6, 0xad, 0x20, 0x5b},
+ {0x0, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae, 0x93, 0xef, 0x6b, 0x17, 0x7e, 0x2, 0x86, 0xfa, 0x54, 0x28, 0xac, 0xd0, 0xb9, 0xc5, 0x41, 0x3d, 0x3b, 0x47, 0xc3, 0xbf, 0xd6, 0xaa, 0x2e, 0x52, 0xfc, 0x80, 0x4, 0x78, 0x11, 0x6d, 0xe9, 0x95, 0xa8, 0xd4, 0x50, 0x2c, 0x45, 0x39, 0xbd, 0xc1, 0x6f, 0x13, 0x97, 0xeb, 0x82, 0xfe, 0x7a, 0x6, 0x76, 0xa, 0x8e, 0xf2, 0x9b, 0xe7, 0x63, 0x1f, 0xb1, 0xcd, 0x49, 0x35, 0x5c, 0x20, 0xa4, 0xd8, 0xe5, 0x99, 0x1d, 0x61, 0x8, 0x74, 0xf0, 0x8c, 0x22, 0x5e, 0xda, 0xa6, 0xcf, 0xb3, 0x37, 0x4b, 0x4d, 0x31, 0xb5, 0xc9, 0xa0, 0xdc, 0x58, 0x24, 0x8a, 0xf6, 0x72, 0xe, 0x67, 0x1b, 0x9f, 0xe3, 0xde, 0xa2, 0x26, 0x5a, 0x33, 0x4f, 0xcb, 0xb7, 0x19, 0x65, 0xe1, 0x9d, 0xf4, 0x88, 0xc, 0x70, 0xec, 0x90, 0x14, 0x68, 0x1, 0x7d, 0xf9, 0x85, 0x2b, 0x57, 0xd3, 0xaf, 0xc6, 0xba, 0x3e, 0x42, 0x7f, 0x3, 0x87, 0xfb, 0x92, 0xee, 0x6a, 0x16, 0xb8, 0xc4, 0x40, 0x3c, 0x55, 0x29, 0xad, 0xd1, 0xd7, 0xab, 0x2f, 0x53, 0x3a, 0x46, 0xc2, 0xbe, 0x10, 0x6c, 0xe8, 0x94, 0xfd, 0x81, 0x5, 0x79, 0x44, 0x38, 0xbc, 0xc0, 0xa9, 0xd5, 0x51, 0x2d, 0x83, 0xff, 0x7b, 0x7, 0x6e, 0x12, 0x96, 0xea, 0x9a, 0xe6, 0x62, 0x1e, 0x77, 0xb, 0x8f, 0xf3, 0x5d, 0x21, 0xa5, 0xd9, 0xb0, 0xcc, 0x48, 0x34, 0x9, 0x75, 0xf1, 0x8d, 0xe4, 0x98, 0x1c, 0x60, 0xce, 0xb2, 0x36, 0x4a, 0x23, 0x5f, 0xdb, 0xa7, 0xa1, 0xdd, 0x59, 0x25, 0x4c, 0x30, 0xb4, 0xc8, 0x66, 0x1a, 0x9e, 0xe2, 0x8b, 0xf7, 0x73, 0xf, 0x32, 0x4e, 0xca, 0xb6, 0xdf, 0xa3, 0x27, 0x5b, 0xf5, 0x89, 0xd, 0x71, 0x18, 0x64, 0xe0, 0x9c},
+ {0x0, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1, 0x83, 0xfe, 0x79, 0x4, 0x6a, 0x17, 0x90, 0xed, 0x4c, 0x31, 0xb6, 0xcb, 0xa5, 0xd8, 0x5f, 0x22, 0x1b, 0x66, 0xe1, 0x9c, 0xf2, 0x8f, 0x8, 0x75, 0xd4, 0xa9, 0x2e, 0x53, 0x3d, 0x40, 0xc7, 0xba, 0x98, 0xe5, 0x62, 0x1f, 0x71, 0xc, 0x8b, 0xf6, 0x57, 0x2a, 0xad, 0xd0, 0xbe, 0xc3, 0x44, 0x39, 0x36, 0x4b, 0xcc, 0xb1, 0xdf, 0xa2, 0x25, 0x58, 0xf9, 0x84, 0x3, 0x7e, 0x10, 0x6d, 0xea, 0x97, 0xb5, 0xc8, 0x4f, 0x32, 0x5c, 0x21, 0xa6, 0xdb, 0x7a, 0x7, 0x80, 0xfd, 0x93, 0xee, 0x69, 0x14, 0x2d, 0x50, 0xd7, 0xaa, 0xc4, 0xb9, 0x3e, 0x43, 0xe2, 0x9f, 0x18, 0x65, 0xb, 0x76, 0xf1, 0x8c, 0xae, 0xd3, 0x54, 0x29, 0x47, 0x3a, 0xbd, 0xc0, 0x61, 0x1c, 0x9b, 0xe6, 0x88, 0xf5, 0x72, 0xf, 0x6c, 0x11, 0x96, 0xeb, 0x85, 0xf8, 0x7f, 0x2, 0xa3, 0xde, 0x59, 0x24, 0x4a, 0x37, 0xb0, 0xcd, 0xef, 0x92, 0x15, 0x68, 0x6, 0x7b, 0xfc, 0x81, 0x20, 0x5d, 0xda, 0xa7, 0xc9, 0xb4, 0x33, 0x4e, 0x77, 0xa, 0x8d, 0xf0, 0x9e, 0xe3, 0x64, 0x19, 0xb8, 0xc5, 0x42, 0x3f, 0x51, 0x2c, 0xab, 0xd6, 0xf4, 0x89, 0xe, 0x73, 0x1d, 0x60, 0xe7, 0x9a, 0x3b, 0x46, 0xc1, 0xbc, 0xd2, 0xaf, 0x28, 0x55, 0x5a, 0x27, 0xa0, 0xdd, 0xb3, 0xce, 0x49, 0x34, 0x95, 0xe8, 0x6f, 0x12, 0x7c, 0x1, 0x86, 0xfb, 0xd9, 0xa4, 0x23, 0x5e, 0x30, 0x4d, 0xca, 0xb7, 0x16, 0x6b, 0xec, 0x91, 0xff, 0x82, 0x5, 0x78, 0x41, 0x3c, 0xbb, 0xc6, 0xa8, 0xd5, 0x52, 0x2f, 0x8e, 0xf3, 0x74, 0x9, 0x67, 0x1a, 0x9d, 0xe0, 0xc2, 0xbf, 0x38, 0x45, 0x2b, 0x56, 0xd1, 0xac, 0xd, 0x70, 0xf7, 0x8a, 0xe4, 0x99, 0x1e, 0x63},
+ {0x0, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0, 0xb3, 0xcd, 0x4f, 0x31, 0x56, 0x28, 0xaa, 0xd4, 0x64, 0x1a, 0x98, 0xe6, 0x81, 0xff, 0x7d, 0x3, 0x7b, 0x5, 0x87, 0xf9, 0x9e, 0xe0, 0x62, 0x1c, 0xac, 0xd2, 0x50, 0x2e, 0x49, 0x37, 0xb5, 0xcb, 0xc8, 0xb6, 0x34, 0x4a, 0x2d, 0x53, 0xd1, 0xaf, 0x1f, 0x61, 0xe3, 0x9d, 0xfa, 0x84, 0x6, 0x78, 0xf6, 0x88, 0xa, 0x74, 0x13, 0x6d, 0xef, 0x91, 0x21, 0x5f, 0xdd, 0xa3, 0xc4, 0xba, 0x38, 0x46, 0x45, 0x3b, 0xb9, 0xc7, 0xa0, 0xde, 0x5c, 0x22, 0x92, 0xec, 0x6e, 0x10, 0x77, 0x9, 0x8b, 0xf5, 0x8d, 0xf3, 0x71, 0xf, 0x68, 0x16, 0x94, 0xea, 0x5a, 0x24, 0xa6, 0xd8, 0xbf, 0xc1, 0x43, 0x3d, 0x3e, 0x40, 0xc2, 0xbc, 0xdb, 0xa5, 0x27, 0x59, 0xe9, 0x97, 0x15, 0x6b, 0xc, 0x72, 0xf0, 0x8e, 0xf1, 0x8f, 0xd, 0x73, 0x14, 0x6a, 0xe8, 0x96, 0x26, 0x58, 0xda, 0xa4, 0xc3, 0xbd, 0x3f, 0x41, 0x42, 0x3c, 0xbe, 0xc0, 0xa7, 0xd9, 0x5b, 0x25, 0x95, 0xeb, 0x69, 0x17, 0x70, 0xe, 0x8c, 0xf2, 0x8a, 0xf4, 0x76, 0x8, 0x6f, 0x11, 0x93, 0xed, 0x5d, 0x23, 0xa1, 0xdf, 0xb8, 0xc6, 0x44, 0x3a, 0x39, 0x47, 0xc5, 0xbb, 0xdc, 0xa2, 0x20, 0x5e, 0xee, 0x90, 0x12, 0x6c, 0xb, 0x75, 0xf7, 0x89, 0x7, 0x79, 0xfb, 0x85, 0xe2, 0x9c, 0x1e, 0x60, 0xd0, 0xae, 0x2c, 0x52, 0x35, 0x4b, 0xc9, 0xb7, 0xb4, 0xca, 0x48, 0x36, 0x51, 0x2f, 0xad, 0xd3, 0x63, 0x1d, 0x9f, 0xe1, 0x86, 0xf8, 0x7a, 0x4, 0x7c, 0x2, 0x80, 0xfe, 0x99, 0xe7, 0x65, 0x1b, 0xab, 0xd5, 0x57, 0x29, 0x4e, 0x30, 0xb2, 0xcc, 0xcf, 0xb1, 0x33, 0x4d, 0x2a, 0x54, 0xd6, 0xa8, 0x18, 0x66, 0xe4, 0x9a, 0xfd, 0x83, 0x1, 0x7f},
+ {0x0, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf, 0xa3, 0xdc, 0x5d, 0x22, 0x42, 0x3d, 0xbc, 0xc3, 0x7c, 0x3, 0x82, 0xfd, 0x9d, 0xe2, 0x63, 0x1c, 0x5b, 0x24, 0xa5, 0xda, 0xba, 0xc5, 0x44, 0x3b, 0x84, 0xfb, 0x7a, 0x5, 0x65, 0x1a, 0x9b, 0xe4, 0xf8, 0x87, 0x6, 0x79, 0x19, 0x66, 0xe7, 0x98, 0x27, 0x58, 0xd9, 0xa6, 0xc6, 0xb9, 0x38, 0x47, 0xb6, 0xc9, 0x48, 0x37, 0x57, 0x28, 0xa9, 0xd6, 0x69, 0x16, 0x97, 0xe8, 0x88, 0xf7, 0x76, 0x9, 0x15, 0x6a, 0xeb, 0x94, 0xf4, 0x8b, 0xa, 0x75, 0xca, 0xb5, 0x34, 0x4b, 0x2b, 0x54, 0xd5, 0xaa, 0xed, 0x92, 0x13, 0x6c, 0xc, 0x73, 0xf2, 0x8d, 0x32, 0x4d, 0xcc, 0xb3, 0xd3, 0xac, 0x2d, 0x52, 0x4e, 0x31, 0xb0, 0xcf, 0xaf, 0xd0, 0x51, 0x2e, 0x91, 0xee, 0x6f, 0x10, 0x70, 0xf, 0x8e, 0xf1, 0x71, 0xe, 0x8f, 0xf0, 0x90, 0xef, 0x6e, 0x11, 0xae, 0xd1, 0x50, 0x2f, 0x4f, 0x30, 0xb1, 0xce, 0xd2, 0xad, 0x2c, 0x53, 0x33, 0x4c, 0xcd, 0xb2, 0xd, 0x72, 0xf3, 0x8c, 0xec, 0x93, 0x12, 0x6d, 0x2a, 0x55, 0xd4, 0xab, 0xcb, 0xb4, 0x35, 0x4a, 0xf5, 0x8a, 0xb, 0x74, 0x14, 0x6b, 0xea, 0x95, 0x89, 0xf6, 0x77, 0x8, 0x68, 0x17, 0x96, 0xe9, 0x56, 0x29, 0xa8, 0xd7, 0xb7, 0xc8, 0x49, 0x36, 0xc7, 0xb8, 0x39, 0x46, 0x26, 0x59, 0xd8, 0xa7, 0x18, 0x67, 0xe6, 0x99, 0xf9, 0x86, 0x7, 0x78, 0x64, 0x1b, 0x9a, 0xe5, 0x85, 0xfa, 0x7b, 0x4, 0xbb, 0xc4, 0x45, 0x3a, 0x5a, 0x25, 0xa4, 0xdb, 0x9c, 0xe3, 0x62, 0x1d, 0x7d, 0x2, 0x83, 0xfc, 0x43, 0x3c, 0xbd, 0xc2, 0xa2, 0xdd, 0x5c, 0x23, 0x3f, 0x40, 0xc1, 0xbe, 0xde, 0xa1, 0x20, 0x5f, 0xe0, 0x9f, 0x1e, 0x61, 0x1, 0x7e, 0xff, 0x80},
+ {0x0, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3, 0xe8, 0x68, 0xf5, 0x75, 0xd2, 0x52, 0xcf, 0x4f, 0x9c, 0x1c, 0x81, 0x1, 0xa6, 0x26, 0xbb, 0x3b, 0xcd, 0x4d, 0xd0, 0x50, 0xf7, 0x77, 0xea, 0x6a, 0xb9, 0x39, 0xa4, 0x24, 0x83, 0x3, 0x9e, 0x1e, 0x25, 0xa5, 0x38, 0xb8, 0x1f, 0x9f, 0x2, 0x82, 0x51, 0xd1, 0x4c, 0xcc, 0x6b, 0xeb, 0x76, 0xf6, 0x87, 0x7, 0x9a, 0x1a, 0xbd, 0x3d, 0xa0, 0x20, 0xf3, 0x73, 0xee, 0x6e, 0xc9, 0x49, 0xd4, 0x54, 0x6f, 0xef, 0x72, 0xf2, 0x55, 0xd5, 0x48, 0xc8, 0x1b, 0x9b, 0x6, 0x86, 0x21, 0xa1, 0x3c, 0xbc, 0x4a, 0xca, 0x57, 0xd7, 0x70, 0xf0, 0x6d, 0xed, 0x3e, 0xbe, 0x23, 0xa3, 0x4, 0x84, 0x19, 0x99, 0xa2, 0x22, 0xbf, 0x3f, 0x98, 0x18, 0x85, 0x5, 0xd6, 0x56, 0xcb, 0x4b, 0xec, 0x6c, 0xf1, 0x71, 0x13, 0x93, 0xe, 0x8e, 0x29, 0xa9, 0x34, 0xb4, 0x67, 0xe7, 0x7a, 0xfa, 0x5d, 0xdd, 0x40, 0xc0, 0xfb, 0x7b, 0xe6, 0x66, 0xc1, 0x41, 0xdc, 0x5c, 0x8f, 0xf, 0x92, 0x12, 0xb5, 0x35, 0xa8, 0x28, 0xde, 0x5e, 0xc3, 0x43, 0xe4, 0x64, 0xf9, 0x79, 0xaa, 0x2a, 0xb7, 0x37, 0x90, 0x10, 0x8d, 0xd, 0x36, 0xb6, 0x2b, 0xab, 0xc, 0x8c, 0x11, 0x91, 0x42, 0xc2, 0x5f, 0xdf, 0x78, 0xf8, 0x65, 0xe5, 0x94, 0x14, 0x89, 0x9, 0xae, 0x2e, 0xb3, 0x33, 0xe0, 0x60, 0xfd, 0x7d, 0xda, 0x5a, 0xc7, 0x47, 0x7c, 0xfc, 0x61, 0xe1, 0x46, 0xc6, 0x5b, 0xdb, 0x8, 0x88, 0x15, 0x95, 0x32, 0xb2, 0x2f, 0xaf, 0x59, 0xd9, 0x44, 0xc4, 0x63, 0xe3, 0x7e, 0xfe, 0x2d, 0xad, 0x30, 0xb0, 0x17, 0x97, 0xa, 0x8a, 0xb1, 0x31, 0xac, 0x2c, 0x8b, 0xb, 0x96, 0x16, 0xc5, 0x45, 0xd8, 0x58, 0xff, 0x7f, 0xe2, 0x62},
+ {0x0, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc, 0xf8, 0x79, 0xe7, 0x66, 0xc6, 0x47, 0xd9, 0x58, 0x84, 0x5, 0x9b, 0x1a, 0xba, 0x3b, 0xa5, 0x24, 0xed, 0x6c, 0xf2, 0x73, 0xd3, 0x52, 0xcc, 0x4d, 0x91, 0x10, 0x8e, 0xf, 0xaf, 0x2e, 0xb0, 0x31, 0x15, 0x94, 0xa, 0x8b, 0x2b, 0xaa, 0x34, 0xb5, 0x69, 0xe8, 0x76, 0xf7, 0x57, 0xd6, 0x48, 0xc9, 0xc7, 0x46, 0xd8, 0x59, 0xf9, 0x78, 0xe6, 0x67, 0xbb, 0x3a, 0xa4, 0x25, 0x85, 0x4, 0x9a, 0x1b, 0x3f, 0xbe, 0x20, 0xa1, 0x1, 0x80, 0x1e, 0x9f, 0x43, 0xc2, 0x5c, 0xdd, 0x7d, 0xfc, 0x62, 0xe3, 0x2a, 0xab, 0x35, 0xb4, 0x14, 0x95, 0xb, 0x8a, 0x56, 0xd7, 0x49, 0xc8, 0x68, 0xe9, 0x77, 0xf6, 0xd2, 0x53, 0xcd, 0x4c, 0xec, 0x6d, 0xf3, 0x72, 0xae, 0x2f, 0xb1, 0x30, 0x90, 0x11, 0x8f, 0xe, 0x93, 0x12, 0x8c, 0xd, 0xad, 0x2c, 0xb2, 0x33, 0xef, 0x6e, 0xf0, 0x71, 0xd1, 0x50, 0xce, 0x4f, 0x6b, 0xea, 0x74, 0xf5, 0x55, 0xd4, 0x4a, 0xcb, 0x17, 0x96, 0x8, 0x89, 0x29, 0xa8, 0x36, 0xb7, 0x7e, 0xff, 0x61, 0xe0, 0x40, 0xc1, 0x5f, 0xde, 0x2, 0x83, 0x1d, 0x9c, 0x3c, 0xbd, 0x23, 0xa2, 0x86, 0x7, 0x99, 0x18, 0xb8, 0x39, 0xa7, 0x26, 0xfa, 0x7b, 0xe5, 0x64, 0xc4, 0x45, 0xdb, 0x5a, 0x54, 0xd5, 0x4b, 0xca, 0x6a, 0xeb, 0x75, 0xf4, 0x28, 0xa9, 0x37, 0xb6, 0x16, 0x97, 0x9, 0x88, 0xac, 0x2d, 0xb3, 0x32, 0x92, 0x13, 0x8d, 0xc, 0xd0, 0x51, 0xcf, 0x4e, 0xee, 0x6f, 0xf1, 0x70, 0xb9, 0x38, 0xa6, 0x27, 0x87, 0x6, 0x98, 0x19, 0xc5, 0x44, 0xda, 0x5b, 0xfb, 0x7a, 0xe4, 0x65, 0x41, 0xc0, 0x5e, 0xdf, 0x7f, 0xfe, 0x60, 0xe1, 0x3d, 0xbc, 0x22, 0xa3, 0x3, 0x82, 0x1c, 0x9d},
+ {0x0, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd, 0xc8, 0x4a, 0xd1, 0x53, 0xfa, 0x78, 0xe3, 0x61, 0xac, 0x2e, 0xb5, 0x37, 0x9e, 0x1c, 0x87, 0x5, 0x8d, 0xf, 0x94, 0x16, 0xbf, 0x3d, 0xa6, 0x24, 0xe9, 0x6b, 0xf0, 0x72, 0xdb, 0x59, 0xc2, 0x40, 0x45, 0xc7, 0x5c, 0xde, 0x77, 0xf5, 0x6e, 0xec, 0x21, 0xa3, 0x38, 0xba, 0x13, 0x91, 0xa, 0x88, 0x7, 0x85, 0x1e, 0x9c, 0x35, 0xb7, 0x2c, 0xae, 0x63, 0xe1, 0x7a, 0xf8, 0x51, 0xd3, 0x48, 0xca, 0xcf, 0x4d, 0xd6, 0x54, 0xfd, 0x7f, 0xe4, 0x66, 0xab, 0x29, 0xb2, 0x30, 0x99, 0x1b, 0x80, 0x2, 0x8a, 0x8, 0x93, 0x11, 0xb8, 0x3a, 0xa1, 0x23, 0xee, 0x6c, 0xf7, 0x75, 0xdc, 0x5e, 0xc5, 0x47, 0x42, 0xc0, 0x5b, 0xd9, 0x70, 0xf2, 0x69, 0xeb, 0x26, 0xa4, 0x3f, 0xbd, 0x14, 0x96, 0xd, 0x8f, 0xe, 0x8c, 0x17, 0x95, 0x3c, 0xbe, 0x25, 0xa7, 0x6a, 0xe8, 0x73, 0xf1, 0x58, 0xda, 0x41, 0xc3, 0xc6, 0x44, 0xdf, 0x5d, 0xf4, 0x76, 0xed, 0x6f, 0xa2, 0x20, 0xbb, 0x39, 0x90, 0x12, 0x89, 0xb, 0x83, 0x1, 0x9a, 0x18, 0xb1, 0x33, 0xa8, 0x2a, 0xe7, 0x65, 0xfe, 0x7c, 0xd5, 0x57, 0xcc, 0x4e, 0x4b, 0xc9, 0x52, 0xd0, 0x79, 0xfb, 0x60, 0xe2, 0x2f, 0xad, 0x36, 0xb4, 0x1d, 0x9f, 0x4, 0x86, 0x9, 0x8b, 0x10, 0x92, 0x3b, 0xb9, 0x22, 0xa0, 0x6d, 0xef, 0x74, 0xf6, 0x5f, 0xdd, 0x46, 0xc4, 0xc1, 0x43, 0xd8, 0x5a, 0xf3, 0x71, 0xea, 0x68, 0xa5, 0x27, 0xbc, 0x3e, 0x97, 0x15, 0x8e, 0xc, 0x84, 0x6, 0x9d, 0x1f, 0xb6, 0x34, 0xaf, 0x2d, 0xe0, 0x62, 0xf9, 0x7b, 0xd2, 0x50, 0xcb, 0x49, 0x4c, 0xce, 0x55, 0xd7, 0x7e, 0xfc, 0x67, 0xe5, 0x28, 0xaa, 0x31, 0xb3, 0x1a, 0x98, 0x3, 0x81},
+ {0x0, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2, 0xd8, 0x5b, 0xc3, 0x40, 0xee, 0x6d, 0xf5, 0x76, 0xb4, 0x37, 0xaf, 0x2c, 0x82, 0x1, 0x99, 0x1a, 0xad, 0x2e, 0xb6, 0x35, 0x9b, 0x18, 0x80, 0x3, 0xc1, 0x42, 0xda, 0x59, 0xf7, 0x74, 0xec, 0x6f, 0x75, 0xf6, 0x6e, 0xed, 0x43, 0xc0, 0x58, 0xdb, 0x19, 0x9a, 0x2, 0x81, 0x2f, 0xac, 0x34, 0xb7, 0x47, 0xc4, 0x5c, 0xdf, 0x71, 0xf2, 0x6a, 0xe9, 0x2b, 0xa8, 0x30, 0xb3, 0x1d, 0x9e, 0x6, 0x85, 0x9f, 0x1c, 0x84, 0x7, 0xa9, 0x2a, 0xb2, 0x31, 0xf3, 0x70, 0xe8, 0x6b, 0xc5, 0x46, 0xde, 0x5d, 0xea, 0x69, 0xf1, 0x72, 0xdc, 0x5f, 0xc7, 0x44, 0x86, 0x5, 0x9d, 0x1e, 0xb0, 0x33, 0xab, 0x28, 0x32, 0xb1, 0x29, 0xaa, 0x4, 0x87, 0x1f, 0x9c, 0x5e, 0xdd, 0x45, 0xc6, 0x68, 0xeb, 0x73, 0xf0, 0x8e, 0xd, 0x95, 0x16, 0xb8, 0x3b, 0xa3, 0x20, 0xe2, 0x61, 0xf9, 0x7a, 0xd4, 0x57, 0xcf, 0x4c, 0x56, 0xd5, 0x4d, 0xce, 0x60, 0xe3, 0x7b, 0xf8, 0x3a, 0xb9, 0x21, 0xa2, 0xc, 0x8f, 0x17, 0x94, 0x23, 0xa0, 0x38, 0xbb, 0x15, 0x96, 0xe, 0x8d, 0x4f, 0xcc, 0x54, 0xd7, 0x79, 0xfa, 0x62, 0xe1, 0xfb, 0x78, 0xe0, 0x63, 0xcd, 0x4e, 0xd6, 0x55, 0x97, 0x14, 0x8c, 0xf, 0xa1, 0x22, 0xba, 0x39, 0xc9, 0x4a, 0xd2, 0x51, 0xff, 0x7c, 0xe4, 0x67, 0xa5, 0x26, 0xbe, 0x3d, 0x93, 0x10, 0x88, 0xb, 0x11, 0x92, 0xa, 0x89, 0x27, 0xa4, 0x3c, 0xbf, 0x7d, 0xfe, 0x66, 0xe5, 0x4b, 0xc8, 0x50, 0xd3, 0x64, 0xe7, 0x7f, 0xfc, 0x52, 0xd1, 0x49, 0xca, 0x8, 0x8b, 0x13, 0x90, 0x3e, 0xbd, 0x25, 0xa6, 0xbc, 0x3f, 0xa7, 0x24, 0x8a, 0x9, 0x91, 0x12, 0xd0, 0x53, 0xcb, 0x48, 0xe6, 0x65, 0xfd, 0x7e},
+ {0x0, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef, 0xa8, 0x2c, 0xbd, 0x39, 0x82, 0x6, 0x97, 0x13, 0xfc, 0x78, 0xe9, 0x6d, 0xd6, 0x52, 0xc3, 0x47, 0x4d, 0xc9, 0x58, 0xdc, 0x67, 0xe3, 0x72, 0xf6, 0x19, 0x9d, 0xc, 0x88, 0x33, 0xb7, 0x26, 0xa2, 0xe5, 0x61, 0xf0, 0x74, 0xcf, 0x4b, 0xda, 0x5e, 0xb1, 0x35, 0xa4, 0x20, 0x9b, 0x1f, 0x8e, 0xa, 0x9a, 0x1e, 0x8f, 0xb, 0xb0, 0x34, 0xa5, 0x21, 0xce, 0x4a, 0xdb, 0x5f, 0xe4, 0x60, 0xf1, 0x75, 0x32, 0xb6, 0x27, 0xa3, 0x18, 0x9c, 0xd, 0x89, 0x66, 0xe2, 0x73, 0xf7, 0x4c, 0xc8, 0x59, 0xdd, 0xd7, 0x53, 0xc2, 0x46, 0xfd, 0x79, 0xe8, 0x6c, 0x83, 0x7, 0x96, 0x12, 0xa9, 0x2d, 0xbc, 0x38, 0x7f, 0xfb, 0x6a, 0xee, 0x55, 0xd1, 0x40, 0xc4, 0x2b, 0xaf, 0x3e, 0xba, 0x1, 0x85, 0x14, 0x90, 0x29, 0xad, 0x3c, 0xb8, 0x3, 0x87, 0x16, 0x92, 0x7d, 0xf9, 0x68, 0xec, 0x57, 0xd3, 0x42, 0xc6, 0x81, 0x5, 0x94, 0x10, 0xab, 0x2f, 0xbe, 0x3a, 0xd5, 0x51, 0xc0, 0x44, 0xff, 0x7b, 0xea, 0x6e, 0x64, 0xe0, 0x71, 0xf5, 0x4e, 0xca, 0x5b, 0xdf, 0x30, 0xb4, 0x25, 0xa1, 0x1a, 0x9e, 0xf, 0x8b, 0xcc, 0x48, 0xd9, 0x5d, 0xe6, 0x62, 0xf3, 0x77, 0x98, 0x1c, 0x8d, 0x9, 0xb2, 0x36, 0xa7, 0x23, 0xb3, 0x37, 0xa6, 0x22, 0x99, 0x1d, 0x8c, 0x8, 0xe7, 0x63, 0xf2, 0x76, 0xcd, 0x49, 0xd8, 0x5c, 0x1b, 0x9f, 0xe, 0x8a, 0x31, 0xb5, 0x24, 0xa0, 0x4f, 0xcb, 0x5a, 0xde, 0x65, 0xe1, 0x70, 0xf4, 0xfe, 0x7a, 0xeb, 0x6f, 0xd4, 0x50, 0xc1, 0x45, 0xaa, 0x2e, 0xbf, 0x3b, 0x80, 0x4, 0x95, 0x11, 0x56, 0xd2, 0x43, 0xc7, 0x7c, 0xf8, 0x69, 0xed, 0x2, 0x86, 0x17, 0x93, 0x28, 0xac, 0x3d, 0xb9},
+ {0x0, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0, 0xb8, 0x3d, 0xaf, 0x2a, 0x96, 0x13, 0x81, 0x4, 0xe4, 0x61, 0xf3, 0x76, 0xca, 0x4f, 0xdd, 0x58, 0x6d, 0xe8, 0x7a, 0xff, 0x43, 0xc6, 0x54, 0xd1, 0x31, 0xb4, 0x26, 0xa3, 0x1f, 0x9a, 0x8, 0x8d, 0xd5, 0x50, 0xc2, 0x47, 0xfb, 0x7e, 0xec, 0x69, 0x89, 0xc, 0x9e, 0x1b, 0xa7, 0x22, 0xb0, 0x35, 0xda, 0x5f, 0xcd, 0x48, 0xf4, 0x71, 0xe3, 0x66, 0x86, 0x3, 0x91, 0x14, 0xa8, 0x2d, 0xbf, 0x3a, 0x62, 0xe7, 0x75, 0xf0, 0x4c, 0xc9, 0x5b, 0xde, 0x3e, 0xbb, 0x29, 0xac, 0x10, 0x95, 0x7, 0x82, 0xb7, 0x32, 0xa0, 0x25, 0x99, 0x1c, 0x8e, 0xb, 0xeb, 0x6e, 0xfc, 0x79, 0xc5, 0x40, 0xd2, 0x57, 0xf, 0x8a, 0x18, 0x9d, 0x21, 0xa4, 0x36, 0xb3, 0x53, 0xd6, 0x44, 0xc1, 0x7d, 0xf8, 0x6a, 0xef, 0xa9, 0x2c, 0xbe, 0x3b, 0x87, 0x2, 0x90, 0x15, 0xf5, 0x70, 0xe2, 0x67, 0xdb, 0x5e, 0xcc, 0x49, 0x11, 0x94, 0x6, 0x83, 0x3f, 0xba, 0x28, 0xad, 0x4d, 0xc8, 0x5a, 0xdf, 0x63, 0xe6, 0x74, 0xf1, 0xc4, 0x41, 0xd3, 0x56, 0xea, 0x6f, 0xfd, 0x78, 0x98, 0x1d, 0x8f, 0xa, 0xb6, 0x33, 0xa1, 0x24, 0x7c, 0xf9, 0x6b, 0xee, 0x52, 0xd7, 0x45, 0xc0, 0x20, 0xa5, 0x37, 0xb2, 0xe, 0x8b, 0x19, 0x9c, 0x73, 0xf6, 0x64, 0xe1, 0x5d, 0xd8, 0x4a, 0xcf, 0x2f, 0xaa, 0x38, 0xbd, 0x1, 0x84, 0x16, 0x93, 0xcb, 0x4e, 0xdc, 0x59, 0xe5, 0x60, 0xf2, 0x77, 0x97, 0x12, 0x80, 0x5, 0xb9, 0x3c, 0xae, 0x2b, 0x1e, 0x9b, 0x9, 0x8c, 0x30, 0xb5, 0x27, 0xa2, 0x42, 0xc7, 0x55, 0xd0, 0x6c, 0xe9, 0x7b, 0xfe, 0xa6, 0x23, 0xb1, 0x34, 0x88, 0xd, 0x9f, 0x1a, 0xfa, 0x7f, 0xed, 0x68, 0xd4, 0x51, 0xc3, 0x46},
+ {0x0, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1, 0x88, 0xe, 0x99, 0x1f, 0xaa, 0x2c, 0xbb, 0x3d, 0xcc, 0x4a, 0xdd, 0x5b, 0xee, 0x68, 0xff, 0x79, 0xd, 0x8b, 0x1c, 0x9a, 0x2f, 0xa9, 0x3e, 0xb8, 0x49, 0xcf, 0x58, 0xde, 0x6b, 0xed, 0x7a, 0xfc, 0x85, 0x3, 0x94, 0x12, 0xa7, 0x21, 0xb6, 0x30, 0xc1, 0x47, 0xd0, 0x56, 0xe3, 0x65, 0xf2, 0x74, 0x1a, 0x9c, 0xb, 0x8d, 0x38, 0xbe, 0x29, 0xaf, 0x5e, 0xd8, 0x4f, 0xc9, 0x7c, 0xfa, 0x6d, 0xeb, 0x92, 0x14, 0x83, 0x5, 0xb0, 0x36, 0xa1, 0x27, 0xd6, 0x50, 0xc7, 0x41, 0xf4, 0x72, 0xe5, 0x63, 0x17, 0x91, 0x6, 0x80, 0x35, 0xb3, 0x24, 0xa2, 0x53, 0xd5, 0x42, 0xc4, 0x71, 0xf7, 0x60, 0xe6, 0x9f, 0x19, 0x8e, 0x8, 0xbd, 0x3b, 0xac, 0x2a, 0xdb, 0x5d, 0xca, 0x4c, 0xf9, 0x7f, 0xe8, 0x6e, 0x34, 0xb2, 0x25, 0xa3, 0x16, 0x90, 0x7, 0x81, 0x70, 0xf6, 0x61, 0xe7, 0x52, 0xd4, 0x43, 0xc5, 0xbc, 0x3a, 0xad, 0x2b, 0x9e, 0x18, 0x8f, 0x9, 0xf8, 0x7e, 0xe9, 0x6f, 0xda, 0x5c, 0xcb, 0x4d, 0x39, 0xbf, 0x28, 0xae, 0x1b, 0x9d, 0xa, 0x8c, 0x7d, 0xfb, 0x6c, 0xea, 0x5f, 0xd9, 0x4e, 0xc8, 0xb1, 0x37, 0xa0, 0x26, 0x93, 0x15, 0x82, 0x4, 0xf5, 0x73, 0xe4, 0x62, 0xd7, 0x51, 0xc6, 0x40, 0x2e, 0xa8, 0x3f, 0xb9, 0xc, 0x8a, 0x1d, 0x9b, 0x6a, 0xec, 0x7b, 0xfd, 0x48, 0xce, 0x59, 0xdf, 0xa6, 0x20, 0xb7, 0x31, 0x84, 0x2, 0x95, 0x13, 0xe2, 0x64, 0xf3, 0x75, 0xc0, 0x46, 0xd1, 0x57, 0x23, 0xa5, 0x32, 0xb4, 0x1, 0x87, 0x10, 0x96, 0x67, 0xe1, 0x76, 0xf0, 0x45, 0xc3, 0x54, 0xd2, 0xab, 0x2d, 0xba, 0x3c, 0x89, 0xf, 0x98, 0x1e, 0xef, 0x69, 0xfe, 0x78, 0xcd, 0x4b, 0xdc, 0x5a},
+ {0x0, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe, 0x98, 0x1f, 0x8b, 0xc, 0xbe, 0x39, 0xad, 0x2a, 0xd4, 0x53, 0xc7, 0x40, 0xf2, 0x75, 0xe1, 0x66, 0x2d, 0xaa, 0x3e, 0xb9, 0xb, 0x8c, 0x18, 0x9f, 0x61, 0xe6, 0x72, 0xf5, 0x47, 0xc0, 0x54, 0xd3, 0xb5, 0x32, 0xa6, 0x21, 0x93, 0x14, 0x80, 0x7, 0xf9, 0x7e, 0xea, 0x6d, 0xdf, 0x58, 0xcc, 0x4b, 0x5a, 0xdd, 0x49, 0xce, 0x7c, 0xfb, 0x6f, 0xe8, 0x16, 0x91, 0x5, 0x82, 0x30, 0xb7, 0x23, 0xa4, 0xc2, 0x45, 0xd1, 0x56, 0xe4, 0x63, 0xf7, 0x70, 0x8e, 0x9, 0x9d, 0x1a, 0xa8, 0x2f, 0xbb, 0x3c, 0x77, 0xf0, 0x64, 0xe3, 0x51, 0xd6, 0x42, 0xc5, 0x3b, 0xbc, 0x28, 0xaf, 0x1d, 0x9a, 0xe, 0x89, 0xef, 0x68, 0xfc, 0x7b, 0xc9, 0x4e, 0xda, 0x5d, 0xa3, 0x24, 0xb0, 0x37, 0x85, 0x2, 0x96, 0x11, 0xb4, 0x33, 0xa7, 0x20, 0x92, 0x15, 0x81, 0x6, 0xf8, 0x7f, 0xeb, 0x6c, 0xde, 0x59, 0xcd, 0x4a, 0x2c, 0xab, 0x3f, 0xb8, 0xa, 0x8d, 0x19, 0x9e, 0x60, 0xe7, 0x73, 0xf4, 0x46, 0xc1, 0x55, 0xd2, 0x99, 0x1e, 0x8a, 0xd, 0xbf, 0x38, 0xac, 0x2b, 0xd5, 0x52, 0xc6, 0x41, 0xf3, 0x74, 0xe0, 0x67, 0x1, 0x86, 0x12, 0x95, 0x27, 0xa0, 0x34, 0xb3, 0x4d, 0xca, 0x5e, 0xd9, 0x6b, 0xec, 0x78, 0xff, 0xee, 0x69, 0xfd, 0x7a, 0xc8, 0x4f, 0xdb, 0x5c, 0xa2, 0x25, 0xb1, 0x36, 0x84, 0x3, 0x97, 0x10, 0x76, 0xf1, 0x65, 0xe2, 0x50, 0xd7, 0x43, 0xc4, 0x3a, 0xbd, 0x29, 0xae, 0x1c, 0x9b, 0xf, 0x88, 0xc3, 0x44, 0xd0, 0x57, 0xe5, 0x62, 0xf6, 0x71, 0x8f, 0x8, 0x9c, 0x1b, 0xa9, 0x2e, 0xba, 0x3d, 0x5b, 0xdc, 0x48, 0xcf, 0x7d, 0xfa, 0x6e, 0xe9, 0x17, 0x90, 0x4, 0x83, 0x31, 0xb6, 0x22, 0xa5},
+ {0x0, 0x88, 0xd, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab, 0x68, 0xe0, 0x65, 0xed, 0x72, 0xfa, 0x7f, 0xf7, 0x5c, 0xd4, 0x51, 0xd9, 0x46, 0xce, 0x4b, 0xc3, 0xd0, 0x58, 0xdd, 0x55, 0xca, 0x42, 0xc7, 0x4f, 0xe4, 0x6c, 0xe9, 0x61, 0xfe, 0x76, 0xf3, 0x7b, 0xb8, 0x30, 0xb5, 0x3d, 0xa2, 0x2a, 0xaf, 0x27, 0x8c, 0x4, 0x81, 0x9, 0x96, 0x1e, 0x9b, 0x13, 0xbd, 0x35, 0xb0, 0x38, 0xa7, 0x2f, 0xaa, 0x22, 0x89, 0x1, 0x84, 0xc, 0x93, 0x1b, 0x9e, 0x16, 0xd5, 0x5d, 0xd8, 0x50, 0xcf, 0x47, 0xc2, 0x4a, 0xe1, 0x69, 0xec, 0x64, 0xfb, 0x73, 0xf6, 0x7e, 0x6d, 0xe5, 0x60, 0xe8, 0x77, 0xff, 0x7a, 0xf2, 0x59, 0xd1, 0x54, 0xdc, 0x43, 0xcb, 0x4e, 0xc6, 0x5, 0x8d, 0x8, 0x80, 0x1f, 0x97, 0x12, 0x9a, 0x31, 0xb9, 0x3c, 0xb4, 0x2b, 0xa3, 0x26, 0xae, 0x67, 0xef, 0x6a, 0xe2, 0x7d, 0xf5, 0x70, 0xf8, 0x53, 0xdb, 0x5e, 0xd6, 0x49, 0xc1, 0x44, 0xcc, 0xf, 0x87, 0x2, 0x8a, 0x15, 0x9d, 0x18, 0x90, 0x3b, 0xb3, 0x36, 0xbe, 0x21, 0xa9, 0x2c, 0xa4, 0xb7, 0x3f, 0xba, 0x32, 0xad, 0x25, 0xa0, 0x28, 0x83, 0xb, 0x8e, 0x6, 0x99, 0x11, 0x94, 0x1c, 0xdf, 0x57, 0xd2, 0x5a, 0xc5, 0x4d, 0xc8, 0x40, 0xeb, 0x63, 0xe6, 0x6e, 0xf1, 0x79, 0xfc, 0x74, 0xda, 0x52, 0xd7, 0x5f, 0xc0, 0x48, 0xcd, 0x45, 0xee, 0x66, 0xe3, 0x6b, 0xf4, 0x7c, 0xf9, 0x71, 0xb2, 0x3a, 0xbf, 0x37, 0xa8, 0x20, 0xa5, 0x2d, 0x86, 0xe, 0x8b, 0x3, 0x9c, 0x14, 0x91, 0x19, 0xa, 0x82, 0x7, 0x8f, 0x10, 0x98, 0x1d, 0x95, 0x3e, 0xb6, 0x33, 0xbb, 0x24, 0xac, 0x29, 0xa1, 0x62, 0xea, 0x6f, 0xe7, 0x78, 0xf0, 0x75, 0xfd, 0x56, 0xde, 0x5b, 0xd3, 0x4c, 0xc4, 0x41, 0xc9},
+ {0x0, 0x89, 0xf, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4, 0x78, 0xf1, 0x77, 0xfe, 0x66, 0xef, 0x69, 0xe0, 0x44, 0xcd, 0x4b, 0xc2, 0x5a, 0xd3, 0x55, 0xdc, 0xf0, 0x79, 0xff, 0x76, 0xee, 0x67, 0xe1, 0x68, 0xcc, 0x45, 0xc3, 0x4a, 0xd2, 0x5b, 0xdd, 0x54, 0x88, 0x1, 0x87, 0xe, 0x96, 0x1f, 0x99, 0x10, 0xb4, 0x3d, 0xbb, 0x32, 0xaa, 0x23, 0xa5, 0x2c, 0xfd, 0x74, 0xf2, 0x7b, 0xe3, 0x6a, 0xec, 0x65, 0xc1, 0x48, 0xce, 0x47, 0xdf, 0x56, 0xd0, 0x59, 0x85, 0xc, 0x8a, 0x3, 0x9b, 0x12, 0x94, 0x1d, 0xb9, 0x30, 0xb6, 0x3f, 0xa7, 0x2e, 0xa8, 0x21, 0xd, 0x84, 0x2, 0x8b, 0x13, 0x9a, 0x1c, 0x95, 0x31, 0xb8, 0x3e, 0xb7, 0x2f, 0xa6, 0x20, 0xa9, 0x75, 0xfc, 0x7a, 0xf3, 0x6b, 0xe2, 0x64, 0xed, 0x49, 0xc0, 0x46, 0xcf, 0x57, 0xde, 0x58, 0xd1, 0xe7, 0x6e, 0xe8, 0x61, 0xf9, 0x70, 0xf6, 0x7f, 0xdb, 0x52, 0xd4, 0x5d, 0xc5, 0x4c, 0xca, 0x43, 0x9f, 0x16, 0x90, 0x19, 0x81, 0x8, 0x8e, 0x7, 0xa3, 0x2a, 0xac, 0x25, 0xbd, 0x34, 0xb2, 0x3b, 0x17, 0x9e, 0x18, 0x91, 0x9, 0x80, 0x6, 0x8f, 0x2b, 0xa2, 0x24, 0xad, 0x35, 0xbc, 0x3a, 0xb3, 0x6f, 0xe6, 0x60, 0xe9, 0x71, 0xf8, 0x7e, 0xf7, 0x53, 0xda, 0x5c, 0xd5, 0x4d, 0xc4, 0x42, 0xcb, 0x1a, 0x93, 0x15, 0x9c, 0x4, 0x8d, 0xb, 0x82, 0x26, 0xaf, 0x29, 0xa0, 0x38, 0xb1, 0x37, 0xbe, 0x62, 0xeb, 0x6d, 0xe4, 0x7c, 0xf5, 0x73, 0xfa, 0x5e, 0xd7, 0x51, 0xd8, 0x40, 0xc9, 0x4f, 0xc6, 0xea, 0x63, 0xe5, 0x6c, 0xf4, 0x7d, 0xfb, 0x72, 0xd6, 0x5f, 0xd9, 0x50, 0xc8, 0x41, 0xc7, 0x4e, 0x92, 0x1b, 0x9d, 0x14, 0x8c, 0x5, 0x83, 0xa, 0xae, 0x27, 0xa1, 0x28, 0xb0, 0x39, 0xbf, 0x36},
+ {0x0, 0x8a, 0x9, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5, 0x48, 0xc2, 0x41, 0xcb, 0x5a, 0xd0, 0x53, 0xd9, 0x6c, 0xe6, 0x65, 0xef, 0x7e, 0xf4, 0x77, 0xfd, 0x90, 0x1a, 0x99, 0x13, 0x82, 0x8, 0x8b, 0x1, 0xb4, 0x3e, 0xbd, 0x37, 0xa6, 0x2c, 0xaf, 0x25, 0xd8, 0x52, 0xd1, 0x5b, 0xca, 0x40, 0xc3, 0x49, 0xfc, 0x76, 0xf5, 0x7f, 0xee, 0x64, 0xe7, 0x6d, 0x3d, 0xb7, 0x34, 0xbe, 0x2f, 0xa5, 0x26, 0xac, 0x19, 0x93, 0x10, 0x9a, 0xb, 0x81, 0x2, 0x88, 0x75, 0xff, 0x7c, 0xf6, 0x67, 0xed, 0x6e, 0xe4, 0x51, 0xdb, 0x58, 0xd2, 0x43, 0xc9, 0x4a, 0xc0, 0xad, 0x27, 0xa4, 0x2e, 0xbf, 0x35, 0xb6, 0x3c, 0x89, 0x3, 0x80, 0xa, 0x9b, 0x11, 0x92, 0x18, 0xe5, 0x6f, 0xec, 0x66, 0xf7, 0x7d, 0xfe, 0x74, 0xc1, 0x4b, 0xc8, 0x42, 0xd3, 0x59, 0xda, 0x50, 0x7a, 0xf0, 0x73, 0xf9, 0x68, 0xe2, 0x61, 0xeb, 0x5e, 0xd4, 0x57, 0xdd, 0x4c, 0xc6, 0x45, 0xcf, 0x32, 0xb8, 0x3b, 0xb1, 0x20, 0xaa, 0x29, 0xa3, 0x16, 0x9c, 0x1f, 0x95, 0x4, 0x8e, 0xd, 0x87, 0xea, 0x60, 0xe3, 0x69, 0xf8, 0x72, 0xf1, 0x7b, 0xce, 0x44, 0xc7, 0x4d, 0xdc, 0x56, 0xd5, 0x5f, 0xa2, 0x28, 0xab, 0x21, 0xb0, 0x3a, 0xb9, 0x33, 0x86, 0xc, 0x8f, 0x5, 0x94, 0x1e, 0x9d, 0x17, 0x47, 0xcd, 0x4e, 0xc4, 0x55, 0xdf, 0x5c, 0xd6, 0x63, 0xe9, 0x6a, 0xe0, 0x71, 0xfb, 0x78, 0xf2, 0xf, 0x85, 0x6, 0x8c, 0x1d, 0x97, 0x14, 0x9e, 0x2b, 0xa1, 0x22, 0xa8, 0x39, 0xb3, 0x30, 0xba, 0xd7, 0x5d, 0xde, 0x54, 0xc5, 0x4f, 0xcc, 0x46, 0xf3, 0x79, 0xfa, 0x70, 0xe1, 0x6b, 0xe8, 0x62, 0x9f, 0x15, 0x96, 0x1c, 0x8d, 0x7, 0x84, 0xe, 0xbb, 0x31, 0xb2, 0x38, 0xa9, 0x23, 0xa0, 0x2a},
+ {0x0, 0x8b, 0xb, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba, 0x58, 0xd3, 0x53, 0xd8, 0x4e, 0xc5, 0x45, 0xce, 0x74, 0xff, 0x7f, 0xf4, 0x62, 0xe9, 0x69, 0xe2, 0xb0, 0x3b, 0xbb, 0x30, 0xa6, 0x2d, 0xad, 0x26, 0x9c, 0x17, 0x97, 0x1c, 0x8a, 0x1, 0x81, 0xa, 0xe8, 0x63, 0xe3, 0x68, 0xfe, 0x75, 0xf5, 0x7e, 0xc4, 0x4f, 0xcf, 0x44, 0xd2, 0x59, 0xd9, 0x52, 0x7d, 0xf6, 0x76, 0xfd, 0x6b, 0xe0, 0x60, 0xeb, 0x51, 0xda, 0x5a, 0xd1, 0x47, 0xcc, 0x4c, 0xc7, 0x25, 0xae, 0x2e, 0xa5, 0x33, 0xb8, 0x38, 0xb3, 0x9, 0x82, 0x2, 0x89, 0x1f, 0x94, 0x14, 0x9f, 0xcd, 0x46, 0xc6, 0x4d, 0xdb, 0x50, 0xd0, 0x5b, 0xe1, 0x6a, 0xea, 0x61, 0xf7, 0x7c, 0xfc, 0x77, 0x95, 0x1e, 0x9e, 0x15, 0x83, 0x8, 0x88, 0x3, 0xb9, 0x32, 0xb2, 0x39, 0xaf, 0x24, 0xa4, 0x2f, 0xfa, 0x71, 0xf1, 0x7a, 0xec, 0x67, 0xe7, 0x6c, 0xd6, 0x5d, 0xdd, 0x56, 0xc0, 0x4b, 0xcb, 0x40, 0xa2, 0x29, 0xa9, 0x22, 0xb4, 0x3f, 0xbf, 0x34, 0x8e, 0x5, 0x85, 0xe, 0x98, 0x13, 0x93, 0x18, 0x4a, 0xc1, 0x41, 0xca, 0x5c, 0xd7, 0x57, 0xdc, 0x66, 0xed, 0x6d, 0xe6, 0x70, 0xfb, 0x7b, 0xf0, 0x12, 0x99, 0x19, 0x92, 0x4, 0x8f, 0xf, 0x84, 0x3e, 0xb5, 0x35, 0xbe, 0x28, 0xa3, 0x23, 0xa8, 0x87, 0xc, 0x8c, 0x7, 0x91, 0x1a, 0x9a, 0x11, 0xab, 0x20, 0xa0, 0x2b, 0xbd, 0x36, 0xb6, 0x3d, 0xdf, 0x54, 0xd4, 0x5f, 0xc9, 0x42, 0xc2, 0x49, 0xf3, 0x78, 0xf8, 0x73, 0xe5, 0x6e, 0xee, 0x65, 0x37, 0xbc, 0x3c, 0xb7, 0x21, 0xaa, 0x2a, 0xa1, 0x1b, 0x90, 0x10, 0x9b, 0xd, 0x86, 0x6, 0x8d, 0x6f, 0xe4, 0x64, 0xef, 0x79, 0xf2, 0x72, 0xf9, 0x43, 0xc8, 0x48, 0xc3, 0x55, 0xde, 0x5e, 0xd5},
+ {0x0, 0x8c, 0x5, 0x89, 0xa, 0x86, 0xf, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97, 0x28, 0xa4, 0x2d, 0xa1, 0x22, 0xae, 0x27, 0xab, 0x3c, 0xb0, 0x39, 0xb5, 0x36, 0xba, 0x33, 0xbf, 0x50, 0xdc, 0x55, 0xd9, 0x5a, 0xd6, 0x5f, 0xd3, 0x44, 0xc8, 0x41, 0xcd, 0x4e, 0xc2, 0x4b, 0xc7, 0x78, 0xf4, 0x7d, 0xf1, 0x72, 0xfe, 0x77, 0xfb, 0x6c, 0xe0, 0x69, 0xe5, 0x66, 0xea, 0x63, 0xef, 0xa0, 0x2c, 0xa5, 0x29, 0xaa, 0x26, 0xaf, 0x23, 0xb4, 0x38, 0xb1, 0x3d, 0xbe, 0x32, 0xbb, 0x37, 0x88, 0x4, 0x8d, 0x1, 0x82, 0xe, 0x87, 0xb, 0x9c, 0x10, 0x99, 0x15, 0x96, 0x1a, 0x93, 0x1f, 0xf0, 0x7c, 0xf5, 0x79, 0xfa, 0x76, 0xff, 0x73, 0xe4, 0x68, 0xe1, 0x6d, 0xee, 0x62, 0xeb, 0x67, 0xd8, 0x54, 0xdd, 0x51, 0xd2, 0x5e, 0xd7, 0x5b, 0xcc, 0x40, 0xc9, 0x45, 0xc6, 0x4a, 0xc3, 0x4f, 0x5d, 0xd1, 0x58, 0xd4, 0x57, 0xdb, 0x52, 0xde, 0x49, 0xc5, 0x4c, 0xc0, 0x43, 0xcf, 0x46, 0xca, 0x75, 0xf9, 0x70, 0xfc, 0x7f, 0xf3, 0x7a, 0xf6, 0x61, 0xed, 0x64, 0xe8, 0x6b, 0xe7, 0x6e, 0xe2, 0xd, 0x81, 0x8, 0x84, 0x7, 0x8b, 0x2, 0x8e, 0x19, 0x95, 0x1c, 0x90, 0x13, 0x9f, 0x16, 0x9a, 0x25, 0xa9, 0x20, 0xac, 0x2f, 0xa3, 0x2a, 0xa6, 0x31, 0xbd, 0x34, 0xb8, 0x3b, 0xb7, 0x3e, 0xb2, 0xfd, 0x71, 0xf8, 0x74, 0xf7, 0x7b, 0xf2, 0x7e, 0xe9, 0x65, 0xec, 0x60, 0xe3, 0x6f, 0xe6, 0x6a, 0xd5, 0x59, 0xd0, 0x5c, 0xdf, 0x53, 0xda, 0x56, 0xc1, 0x4d, 0xc4, 0x48, 0xcb, 0x47, 0xce, 0x42, 0xad, 0x21, 0xa8, 0x24, 0xa7, 0x2b, 0xa2, 0x2e, 0xb9, 0x35, 0xbc, 0x30, 0xb3, 0x3f, 0xb6, 0x3a, 0x85, 0x9, 0x80, 0xc, 0x8f, 0x3, 0x8a, 0x6, 0x91, 0x1d, 0x94, 0x18, 0x9b, 0x17, 0x9e, 0x12},
+ {0x0, 0x8d, 0x7, 0x8a, 0xe, 0x83, 0x9, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98, 0x38, 0xb5, 0x3f, 0xb2, 0x36, 0xbb, 0x31, 0xbc, 0x24, 0xa9, 0x23, 0xae, 0x2a, 0xa7, 0x2d, 0xa0, 0x70, 0xfd, 0x77, 0xfa, 0x7e, 0xf3, 0x79, 0xf4, 0x6c, 0xe1, 0x6b, 0xe6, 0x62, 0xef, 0x65, 0xe8, 0x48, 0xc5, 0x4f, 0xc2, 0x46, 0xcb, 0x41, 0xcc, 0x54, 0xd9, 0x53, 0xde, 0x5a, 0xd7, 0x5d, 0xd0, 0xe0, 0x6d, 0xe7, 0x6a, 0xee, 0x63, 0xe9, 0x64, 0xfc, 0x71, 0xfb, 0x76, 0xf2, 0x7f, 0xf5, 0x78, 0xd8, 0x55, 0xdf, 0x52, 0xd6, 0x5b, 0xd1, 0x5c, 0xc4, 0x49, 0xc3, 0x4e, 0xca, 0x47, 0xcd, 0x40, 0x90, 0x1d, 0x97, 0x1a, 0x9e, 0x13, 0x99, 0x14, 0x8c, 0x1, 0x8b, 0x6, 0x82, 0xf, 0x85, 0x8, 0xa8, 0x25, 0xaf, 0x22, 0xa6, 0x2b, 0xa1, 0x2c, 0xb4, 0x39, 0xb3, 0x3e, 0xba, 0x37, 0xbd, 0x30, 0xdd, 0x50, 0xda, 0x57, 0xd3, 0x5e, 0xd4, 0x59, 0xc1, 0x4c, 0xc6, 0x4b, 0xcf, 0x42, 0xc8, 0x45, 0xe5, 0x68, 0xe2, 0x6f, 0xeb, 0x66, 0xec, 0x61, 0xf9, 0x74, 0xfe, 0x73, 0xf7, 0x7a, 0xf0, 0x7d, 0xad, 0x20, 0xaa, 0x27, 0xa3, 0x2e, 0xa4, 0x29, 0xb1, 0x3c, 0xb6, 0x3b, 0xbf, 0x32, 0xb8, 0x35, 0x95, 0x18, 0x92, 0x1f, 0x9b, 0x16, 0x9c, 0x11, 0x89, 0x4, 0x8e, 0x3, 0x87, 0xa, 0x80, 0xd, 0x3d, 0xb0, 0x3a, 0xb7, 0x33, 0xbe, 0x34, 0xb9, 0x21, 0xac, 0x26, 0xab, 0x2f, 0xa2, 0x28, 0xa5, 0x5, 0x88, 0x2, 0x8f, 0xb, 0x86, 0xc, 0x81, 0x19, 0x94, 0x1e, 0x93, 0x17, 0x9a, 0x10, 0x9d, 0x4d, 0xc0, 0x4a, 0xc7, 0x43, 0xce, 0x44, 0xc9, 0x51, 0xdc, 0x56, 0xdb, 0x5f, 0xd2, 0x58, 0xd5, 0x75, 0xf8, 0x72, 0xff, 0x7b, 0xf6, 0x7c, 0xf1, 0x69, 0xe4, 0x6e, 0xe3, 0x67, 0xea, 0x60, 0xed},
+ {0x0, 0x8e, 0x1, 0x8f, 0x2, 0x8c, 0x3, 0x8d, 0x4, 0x8a, 0x5, 0x8b, 0x6, 0x88, 0x7, 0x89, 0x8, 0x86, 0x9, 0x87, 0xa, 0x84, 0xb, 0x85, 0xc, 0x82, 0xd, 0x83, 0xe, 0x80, 0xf, 0x81, 0x10, 0x9e, 0x11, 0x9f, 0x12, 0x9c, 0x13, 0x9d, 0x14, 0x9a, 0x15, 0x9b, 0x16, 0x98, 0x17, 0x99, 0x18, 0x96, 0x19, 0x97, 0x1a, 0x94, 0x1b, 0x95, 0x1c, 0x92, 0x1d, 0x93, 0x1e, 0x90, 0x1f, 0x91, 0x20, 0xae, 0x21, 0xaf, 0x22, 0xac, 0x23, 0xad, 0x24, 0xaa, 0x25, 0xab, 0x26, 0xa8, 0x27, 0xa9, 0x28, 0xa6, 0x29, 0xa7, 0x2a, 0xa4, 0x2b, 0xa5, 0x2c, 0xa2, 0x2d, 0xa3, 0x2e, 0xa0, 0x2f, 0xa1, 0x30, 0xbe, 0x31, 0xbf, 0x32, 0xbc, 0x33, 0xbd, 0x34, 0xba, 0x35, 0xbb, 0x36, 0xb8, 0x37, 0xb9, 0x38, 0xb6, 0x39, 0xb7, 0x3a, 0xb4, 0x3b, 0xb5, 0x3c, 0xb2, 0x3d, 0xb3, 0x3e, 0xb0, 0x3f, 0xb1, 0x40, 0xce, 0x41, 0xcf, 0x42, 0xcc, 0x43, 0xcd, 0x44, 0xca, 0x45, 0xcb, 0x46, 0xc8, 0x47, 0xc9, 0x48, 0xc6, 0x49, 0xc7, 0x4a, 0xc4, 0x4b, 0xc5, 0x4c, 0xc2, 0x4d, 0xc3, 0x4e, 0xc0, 0x4f, 0xc1, 0x50, 0xde, 0x51, 0xdf, 0x52, 0xdc, 0x53, 0xdd, 0x54, 0xda, 0x55, 0xdb, 0x56, 0xd8, 0x57, 0xd9, 0x58, 0xd6, 0x59, 0xd7, 0x5a, 0xd4, 0x5b, 0xd5, 0x5c, 0xd2, 0x5d, 0xd3, 0x5e, 0xd0, 0x5f, 0xd1, 0x60, 0xee, 0x61, 0xef, 0x62, 0xec, 0x63, 0xed, 0x64, 0xea, 0x65, 0xeb, 0x66, 0xe8, 0x67, 0xe9, 0x68, 0xe6, 0x69, 0xe7, 0x6a, 0xe4, 0x6b, 0xe5, 0x6c, 0xe2, 0x6d, 0xe3, 0x6e, 0xe0, 0x6f, 0xe1, 0x70, 0xfe, 0x71, 0xff, 0x72, 0xfc, 0x73, 0xfd, 0x74, 0xfa, 0x75, 0xfb, 0x76, 0xf8, 0x77, 0xf9, 0x78, 0xf6, 0x79, 0xf7, 0x7a, 0xf4, 0x7b, 0xf5, 0x7c, 0xf2, 0x7d, 0xf3, 0x7e, 0xf0, 0x7f, 0xf1},
+ {0x0, 0x8f, 0x3, 0x8c, 0x6, 0x89, 0x5, 0x8a, 0xc, 0x83, 0xf, 0x80, 0xa, 0x85, 0x9, 0x86, 0x18, 0x97, 0x1b, 0x94, 0x1e, 0x91, 0x1d, 0x92, 0x14, 0x9b, 0x17, 0x98, 0x12, 0x9d, 0x11, 0x9e, 0x30, 0xbf, 0x33, 0xbc, 0x36, 0xb9, 0x35, 0xba, 0x3c, 0xb3, 0x3f, 0xb0, 0x3a, 0xb5, 0x39, 0xb6, 0x28, 0xa7, 0x2b, 0xa4, 0x2e, 0xa1, 0x2d, 0xa2, 0x24, 0xab, 0x27, 0xa8, 0x22, 0xad, 0x21, 0xae, 0x60, 0xef, 0x63, 0xec, 0x66, 0xe9, 0x65, 0xea, 0x6c, 0xe3, 0x6f, 0xe0, 0x6a, 0xe5, 0x69, 0xe6, 0x78, 0xf7, 0x7b, 0xf4, 0x7e, 0xf1, 0x7d, 0xf2, 0x74, 0xfb, 0x77, 0xf8, 0x72, 0xfd, 0x71, 0xfe, 0x50, 0xdf, 0x53, 0xdc, 0x56, 0xd9, 0x55, 0xda, 0x5c, 0xd3, 0x5f, 0xd0, 0x5a, 0xd5, 0x59, 0xd6, 0x48, 0xc7, 0x4b, 0xc4, 0x4e, 0xc1, 0x4d, 0xc2, 0x44, 0xcb, 0x47, 0xc8, 0x42, 0xcd, 0x41, 0xce, 0xc0, 0x4f, 0xc3, 0x4c, 0xc6, 0x49, 0xc5, 0x4a, 0xcc, 0x43, 0xcf, 0x40, 0xca, 0x45, 0xc9, 0x46, 0xd8, 0x57, 0xdb, 0x54, 0xde, 0x51, 0xdd, 0x52, 0xd4, 0x5b, 0xd7, 0x58, 0xd2, 0x5d, 0xd1, 0x5e, 0xf0, 0x7f, 0xf3, 0x7c, 0xf6, 0x79, 0xf5, 0x7a, 0xfc, 0x73, 0xff, 0x70, 0xfa, 0x75, 0xf9, 0x76, 0xe8, 0x67, 0xeb, 0x64, 0xee, 0x61, 0xed, 0x62, 0xe4, 0x6b, 0xe7, 0x68, 0xe2, 0x6d, 0xe1, 0x6e, 0xa0, 0x2f, 0xa3, 0x2c, 0xa6, 0x29, 0xa5, 0x2a, 0xac, 0x23, 0xaf, 0x20, 0xaa, 0x25, 0xa9, 0x26, 0xb8, 0x37, 0xbb, 0x34, 0xbe, 0x31, 0xbd, 0x32, 0xb4, 0x3b, 0xb7, 0x38, 0xb2, 0x3d, 0xb1, 0x3e, 0x90, 0x1f, 0x93, 0x1c, 0x96, 0x19, 0x95, 0x1a, 0x9c, 0x13, 0x9f, 0x10, 0x9a, 0x15, 0x99, 0x16, 0x88, 0x7, 0x8b, 0x4, 0x8e, 0x1, 0x8d, 0x2, 0x84, 0xb, 0x87, 0x8, 0x82, 0xd, 0x81, 0xe},
+ {0x0, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23, 0xf5, 0x65, 0xc8, 0x58, 0x8f, 0x1f, 0xb2, 0x22, 0x1, 0x91, 0x3c, 0xac, 0x7b, 0xeb, 0x46, 0xd6, 0xf7, 0x67, 0xca, 0x5a, 0x8d, 0x1d, 0xb0, 0x20, 0x3, 0x93, 0x3e, 0xae, 0x79, 0xe9, 0x44, 0xd4, 0x2, 0x92, 0x3f, 0xaf, 0x78, 0xe8, 0x45, 0xd5, 0xf6, 0x66, 0xcb, 0x5b, 0x8c, 0x1c, 0xb1, 0x21, 0xf3, 0x63, 0xce, 0x5e, 0x89, 0x19, 0xb4, 0x24, 0x7, 0x97, 0x3a, 0xaa, 0x7d, 0xed, 0x40, 0xd0, 0x6, 0x96, 0x3b, 0xab, 0x7c, 0xec, 0x41, 0xd1, 0xf2, 0x62, 0xcf, 0x5f, 0x88, 0x18, 0xb5, 0x25, 0x4, 0x94, 0x39, 0xa9, 0x7e, 0xee, 0x43, 0xd3, 0xf0, 0x60, 0xcd, 0x5d, 0x8a, 0x1a, 0xb7, 0x27, 0xf1, 0x61, 0xcc, 0x5c, 0x8b, 0x1b, 0xb6, 0x26, 0x5, 0x95, 0x38, 0xa8, 0x7f, 0xef, 0x42, 0xd2, 0xfb, 0x6b, 0xc6, 0x56, 0x81, 0x11, 0xbc, 0x2c, 0xf, 0x9f, 0x32, 0xa2, 0x75, 0xe5, 0x48, 0xd8, 0xe, 0x9e, 0x33, 0xa3, 0x74, 0xe4, 0x49, 0xd9, 0xfa, 0x6a, 0xc7, 0x57, 0x80, 0x10, 0xbd, 0x2d, 0xc, 0x9c, 0x31, 0xa1, 0x76, 0xe6, 0x4b, 0xdb, 0xf8, 0x68, 0xc5, 0x55, 0x82, 0x12, 0xbf, 0x2f, 0xf9, 0x69, 0xc4, 0x54, 0x83, 0x13, 0xbe, 0x2e, 0xd, 0x9d, 0x30, 0xa0, 0x77, 0xe7, 0x4a, 0xda, 0x8, 0x98, 0x35, 0xa5, 0x72, 0xe2, 0x4f, 0xdf, 0xfc, 0x6c, 0xc1, 0x51, 0x86, 0x16, 0xbb, 0x2b, 0xfd, 0x6d, 0xc0, 0x50, 0x87, 0x17, 0xba, 0x2a, 0x9, 0x99, 0x34, 0xa4, 0x73, 0xe3, 0x4e, 0xde, 0xff, 0x6f, 0xc2, 0x52, 0x85, 0x15, 0xb8, 0x28, 0xb, 0x9b, 0x36, 0xa6, 0x71, 0xe1, 0x4c, 0xdc, 0xa, 0x9a, 0x37, 0xa7, 0x70, 0xe0, 0x4d, 0xdd, 0xfe, 0x6e, 0xc3, 0x53, 0x84, 0x14, 0xb9, 0x29},
+ {0x0, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c, 0xe5, 0x74, 0xda, 0x4b, 0x9b, 0xa, 0xa4, 0x35, 0x19, 0x88, 0x26, 0xb7, 0x67, 0xf6, 0x58, 0xc9, 0xd7, 0x46, 0xe8, 0x79, 0xa9, 0x38, 0x96, 0x7, 0x2b, 0xba, 0x14, 0x85, 0x55, 0xc4, 0x6a, 0xfb, 0x32, 0xa3, 0xd, 0x9c, 0x4c, 0xdd, 0x73, 0xe2, 0xce, 0x5f, 0xf1, 0x60, 0xb0, 0x21, 0x8f, 0x1e, 0xb3, 0x22, 0x8c, 0x1d, 0xcd, 0x5c, 0xf2, 0x63, 0x4f, 0xde, 0x70, 0xe1, 0x31, 0xa0, 0xe, 0x9f, 0x56, 0xc7, 0x69, 0xf8, 0x28, 0xb9, 0x17, 0x86, 0xaa, 0x3b, 0x95, 0x4, 0xd4, 0x45, 0xeb, 0x7a, 0x64, 0xf5, 0x5b, 0xca, 0x1a, 0x8b, 0x25, 0xb4, 0x98, 0x9, 0xa7, 0x36, 0xe6, 0x77, 0xd9, 0x48, 0x81, 0x10, 0xbe, 0x2f, 0xff, 0x6e, 0xc0, 0x51, 0x7d, 0xec, 0x42, 0xd3, 0x3, 0x92, 0x3c, 0xad, 0x7b, 0xea, 0x44, 0xd5, 0x5, 0x94, 0x3a, 0xab, 0x87, 0x16, 0xb8, 0x29, 0xf9, 0x68, 0xc6, 0x57, 0x9e, 0xf, 0xa1, 0x30, 0xe0, 0x71, 0xdf, 0x4e, 0x62, 0xf3, 0x5d, 0xcc, 0x1c, 0x8d, 0x23, 0xb2, 0xac, 0x3d, 0x93, 0x2, 0xd2, 0x43, 0xed, 0x7c, 0x50, 0xc1, 0x6f, 0xfe, 0x2e, 0xbf, 0x11, 0x80, 0x49, 0xd8, 0x76, 0xe7, 0x37, 0xa6, 0x8, 0x99, 0xb5, 0x24, 0x8a, 0x1b, 0xcb, 0x5a, 0xf4, 0x65, 0xc8, 0x59, 0xf7, 0x66, 0xb6, 0x27, 0x89, 0x18, 0x34, 0xa5, 0xb, 0x9a, 0x4a, 0xdb, 0x75, 0xe4, 0x2d, 0xbc, 0x12, 0x83, 0x53, 0xc2, 0x6c, 0xfd, 0xd1, 0x40, 0xee, 0x7f, 0xaf, 0x3e, 0x90, 0x1, 0x1f, 0x8e, 0x20, 0xb1, 0x61, 0xf0, 0x5e, 0xcf, 0xe3, 0x72, 0xdc, 0x4d, 0x9d, 0xc, 0xa2, 0x33, 0xfa, 0x6b, 0xc5, 0x54, 0x84, 0x15, 0xbb, 0x2a, 0x6, 0x97, 0x39, 0xa8, 0x78, 0xe9, 0x47, 0xd6},
+ {0x0, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x4, 0xaf, 0x3d, 0xd5, 0x47, 0xec, 0x7e, 0xa7, 0x35, 0x9e, 0xc, 0x31, 0xa3, 0x8, 0x9a, 0x43, 0xd1, 0x7a, 0xe8, 0xb7, 0x25, 0x8e, 0x1c, 0xc5, 0x57, 0xfc, 0x6e, 0x53, 0xc1, 0x6a, 0xf8, 0x21, 0xb3, 0x18, 0x8a, 0x62, 0xf0, 0x5b, 0xc9, 0x10, 0x82, 0x29, 0xbb, 0x86, 0x14, 0xbf, 0x2d, 0xf4, 0x66, 0xcd, 0x5f, 0x73, 0xe1, 0x4a, 0xd8, 0x1, 0x93, 0x38, 0xaa, 0x97, 0x5, 0xae, 0x3c, 0xe5, 0x77, 0xdc, 0x4e, 0xa6, 0x34, 0x9f, 0xd, 0xd4, 0x46, 0xed, 0x7f, 0x42, 0xd0, 0x7b, 0xe9, 0x30, 0xa2, 0x9, 0x9b, 0xc4, 0x56, 0xfd, 0x6f, 0xb6, 0x24, 0x8f, 0x1d, 0x20, 0xb2, 0x19, 0x8b, 0x52, 0xc0, 0x6b, 0xf9, 0x11, 0x83, 0x28, 0xba, 0x63, 0xf1, 0x5a, 0xc8, 0xf5, 0x67, 0xcc, 0x5e, 0x87, 0x15, 0xbe, 0x2c, 0xe6, 0x74, 0xdf, 0x4d, 0x94, 0x6, 0xad, 0x3f, 0x2, 0x90, 0x3b, 0xa9, 0x70, 0xe2, 0x49, 0xdb, 0x33, 0xa1, 0xa, 0x98, 0x41, 0xd3, 0x78, 0xea, 0xd7, 0x45, 0xee, 0x7c, 0xa5, 0x37, 0x9c, 0xe, 0x51, 0xc3, 0x68, 0xfa, 0x23, 0xb1, 0x1a, 0x88, 0xb5, 0x27, 0x8c, 0x1e, 0xc7, 0x55, 0xfe, 0x6c, 0x84, 0x16, 0xbd, 0x2f, 0xf6, 0x64, 0xcf, 0x5d, 0x60, 0xf2, 0x59, 0xcb, 0x12, 0x80, 0x2b, 0xb9, 0x95, 0x7, 0xac, 0x3e, 0xe7, 0x75, 0xde, 0x4c, 0x71, 0xe3, 0x48, 0xda, 0x3, 0x91, 0x3a, 0xa8, 0x40, 0xd2, 0x79, 0xeb, 0x32, 0xa0, 0xb, 0x99, 0xa4, 0x36, 0x9d, 0xf, 0xd6, 0x44, 0xef, 0x7d, 0x22, 0xb0, 0x1b, 0x89, 0x50, 0xc2, 0x69, 0xfb, 0xc6, 0x54, 0xff, 0x6d, 0xb4, 0x26, 0x8d, 0x1f, 0xf7, 0x65, 0xce, 0x5c, 0x85, 0x17, 0xbc, 0x2e, 0x13, 0x81, 0x2a, 0xb8, 0x61, 0xf3, 0x58, 0xca},
+ {0x0, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x9, 0xa1, 0x32, 0xc5, 0x56, 0xfe, 0x6d, 0xb3, 0x20, 0x88, 0x1b, 0x29, 0xba, 0x12, 0x81, 0x5f, 0xcc, 0x64, 0xf7, 0x97, 0x4, 0xac, 0x3f, 0xe1, 0x72, 0xda, 0x49, 0x7b, 0xe8, 0x40, 0xd3, 0xd, 0x9e, 0x36, 0xa5, 0x52, 0xc1, 0x69, 0xfa, 0x24, 0xb7, 0x1f, 0x8c, 0xbe, 0x2d, 0x85, 0x16, 0xc8, 0x5b, 0xf3, 0x60, 0x33, 0xa0, 0x8, 0x9b, 0x45, 0xd6, 0x7e, 0xed, 0xdf, 0x4c, 0xe4, 0x77, 0xa9, 0x3a, 0x92, 0x1, 0xf6, 0x65, 0xcd, 0x5e, 0x80, 0x13, 0xbb, 0x28, 0x1a, 0x89, 0x21, 0xb2, 0x6c, 0xff, 0x57, 0xc4, 0xa4, 0x37, 0x9f, 0xc, 0xd2, 0x41, 0xe9, 0x7a, 0x48, 0xdb, 0x73, 0xe0, 0x3e, 0xad, 0x5, 0x96, 0x61, 0xf2, 0x5a, 0xc9, 0x17, 0x84, 0x2c, 0xbf, 0x8d, 0x1e, 0xb6, 0x25, 0xfb, 0x68, 0xc0, 0x53, 0x66, 0xf5, 0x5d, 0xce, 0x10, 0x83, 0x2b, 0xb8, 0x8a, 0x19, 0xb1, 0x22, 0xfc, 0x6f, 0xc7, 0x54, 0xa3, 0x30, 0x98, 0xb, 0xd5, 0x46, 0xee, 0x7d, 0x4f, 0xdc, 0x74, 0xe7, 0x39, 0xaa, 0x2, 0x91, 0xf1, 0x62, 0xca, 0x59, 0x87, 0x14, 0xbc, 0x2f, 0x1d, 0x8e, 0x26, 0xb5, 0x6b, 0xf8, 0x50, 0xc3, 0x34, 0xa7, 0xf, 0x9c, 0x42, 0xd1, 0x79, 0xea, 0xd8, 0x4b, 0xe3, 0x70, 0xae, 0x3d, 0x95, 0x6, 0x55, 0xc6, 0x6e, 0xfd, 0x23, 0xb0, 0x18, 0x8b, 0xb9, 0x2a, 0x82, 0x11, 0xcf, 0x5c, 0xf4, 0x67, 0x90, 0x3, 0xab, 0x38, 0xe6, 0x75, 0xdd, 0x4e, 0x7c, 0xef, 0x47, 0xd4, 0xa, 0x99, 0x31, 0xa2, 0xc2, 0x51, 0xf9, 0x6a, 0xb4, 0x27, 0x8f, 0x1c, 0x2e, 0xbd, 0x15, 0x86, 0x58, 0xcb, 0x63, 0xf0, 0x7, 0x94, 0x3c, 0xaf, 0x71, 0xe2, 0x4a, 0xd9, 0xeb, 0x78, 0xd0, 0x43, 0x9d, 0xe, 0xa6, 0x35},
+ {0x0, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f, 0xb5, 0x21, 0x80, 0x14, 0xdf, 0x4b, 0xea, 0x7e, 0x61, 0xf5, 0x54, 0xc0, 0xb, 0x9f, 0x3e, 0xaa, 0x77, 0xe3, 0x42, 0xd6, 0x1d, 0x89, 0x28, 0xbc, 0xa3, 0x37, 0x96, 0x2, 0xc9, 0x5d, 0xfc, 0x68, 0xc2, 0x56, 0xf7, 0x63, 0xa8, 0x3c, 0x9d, 0x9, 0x16, 0x82, 0x23, 0xb7, 0x7c, 0xe8, 0x49, 0xdd, 0xee, 0x7a, 0xdb, 0x4f, 0x84, 0x10, 0xb1, 0x25, 0x3a, 0xae, 0xf, 0x9b, 0x50, 0xc4, 0x65, 0xf1, 0x5b, 0xcf, 0x6e, 0xfa, 0x31, 0xa5, 0x4, 0x90, 0x8f, 0x1b, 0xba, 0x2e, 0xe5, 0x71, 0xd0, 0x44, 0x99, 0xd, 0xac, 0x38, 0xf3, 0x67, 0xc6, 0x52, 0x4d, 0xd9, 0x78, 0xec, 0x27, 0xb3, 0x12, 0x86, 0x2c, 0xb8, 0x19, 0x8d, 0x46, 0xd2, 0x73, 0xe7, 0xf8, 0x6c, 0xcd, 0x59, 0x92, 0x6, 0xa7, 0x33, 0xc1, 0x55, 0xf4, 0x60, 0xab, 0x3f, 0x9e, 0xa, 0x15, 0x81, 0x20, 0xb4, 0x7f, 0xeb, 0x4a, 0xde, 0x74, 0xe0, 0x41, 0xd5, 0x1e, 0x8a, 0x2b, 0xbf, 0xa0, 0x34, 0x95, 0x1, 0xca, 0x5e, 0xff, 0x6b, 0xb6, 0x22, 0x83, 0x17, 0xdc, 0x48, 0xe9, 0x7d, 0x62, 0xf6, 0x57, 0xc3, 0x8, 0x9c, 0x3d, 0xa9, 0x3, 0x97, 0x36, 0xa2, 0x69, 0xfd, 0x5c, 0xc8, 0xd7, 0x43, 0xe2, 0x76, 0xbd, 0x29, 0x88, 0x1c, 0x2f, 0xbb, 0x1a, 0x8e, 0x45, 0xd1, 0x70, 0xe4, 0xfb, 0x6f, 0xce, 0x5a, 0x91, 0x5, 0xa4, 0x30, 0x9a, 0xe, 0xaf, 0x3b, 0xf0, 0x64, 0xc5, 0x51, 0x4e, 0xda, 0x7b, 0xef, 0x24, 0xb0, 0x11, 0x85, 0x58, 0xcc, 0x6d, 0xf9, 0x32, 0xa6, 0x7, 0x93, 0x8c, 0x18, 0xb9, 0x2d, 0xe6, 0x72, 0xd3, 0x47, 0xed, 0x79, 0xd8, 0x4c, 0x87, 0x13, 0xb2, 0x26, 0x39, 0xad, 0xc, 0x98, 0x53, 0xc7, 0x66, 0xf2},
+ {0x0, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10, 0xa5, 0x30, 0x92, 0x7, 0xcb, 0x5e, 0xfc, 0x69, 0x79, 0xec, 0x4e, 0xdb, 0x17, 0x82, 0x20, 0xb5, 0x57, 0xc2, 0x60, 0xf5, 0x39, 0xac, 0xe, 0x9b, 0x8b, 0x1e, 0xbc, 0x29, 0xe5, 0x70, 0xd2, 0x47, 0xf2, 0x67, 0xc5, 0x50, 0x9c, 0x9, 0xab, 0x3e, 0x2e, 0xbb, 0x19, 0x8c, 0x40, 0xd5, 0x77, 0xe2, 0xae, 0x3b, 0x99, 0xc, 0xc0, 0x55, 0xf7, 0x62, 0x72, 0xe7, 0x45, 0xd0, 0x1c, 0x89, 0x2b, 0xbe, 0xb, 0x9e, 0x3c, 0xa9, 0x65, 0xf0, 0x52, 0xc7, 0xd7, 0x42, 0xe0, 0x75, 0xb9, 0x2c, 0x8e, 0x1b, 0xf9, 0x6c, 0xce, 0x5b, 0x97, 0x2, 0xa0, 0x35, 0x25, 0xb0, 0x12, 0x87, 0x4b, 0xde, 0x7c, 0xe9, 0x5c, 0xc9, 0x6b, 0xfe, 0x32, 0xa7, 0x5, 0x90, 0x80, 0x15, 0xb7, 0x22, 0xee, 0x7b, 0xd9, 0x4c, 0x41, 0xd4, 0x76, 0xe3, 0x2f, 0xba, 0x18, 0x8d, 0x9d, 0x8, 0xaa, 0x3f, 0xf3, 0x66, 0xc4, 0x51, 0xe4, 0x71, 0xd3, 0x46, 0x8a, 0x1f, 0xbd, 0x28, 0x38, 0xad, 0xf, 0x9a, 0x56, 0xc3, 0x61, 0xf4, 0x16, 0x83, 0x21, 0xb4, 0x78, 0xed, 0x4f, 0xda, 0xca, 0x5f, 0xfd, 0x68, 0xa4, 0x31, 0x93, 0x6, 0xb3, 0x26, 0x84, 0x11, 0xdd, 0x48, 0xea, 0x7f, 0x6f, 0xfa, 0x58, 0xcd, 0x1, 0x94, 0x36, 0xa3, 0xef, 0x7a, 0xd8, 0x4d, 0x81, 0x14, 0xb6, 0x23, 0x33, 0xa6, 0x4, 0x91, 0x5d, 0xc8, 0x6a, 0xff, 0x4a, 0xdf, 0x7d, 0xe8, 0x24, 0xb1, 0x13, 0x86, 0x96, 0x3, 0xa1, 0x34, 0xf8, 0x6d, 0xcf, 0x5a, 0xb8, 0x2d, 0x8f, 0x1a, 0xd6, 0x43, 0xe1, 0x74, 0x64, 0xf1, 0x53, 0xc6, 0xa, 0x9f, 0x3d, 0xa8, 0x1d, 0x88, 0x2a, 0xbf, 0x73, 0xe6, 0x44, 0xd1, 0xc1, 0x54, 0xf6, 0x63, 0xaf, 0x3a, 0x98, 0xd},
+ {0x0, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x1, 0x95, 0x3, 0xa4, 0x32, 0xf7, 0x61, 0xc6, 0x50, 0x51, 0xc7, 0x60, 0xf6, 0x33, 0xa5, 0x2, 0x94, 0x37, 0xa1, 0x6, 0x90, 0x55, 0xc3, 0x64, 0xf2, 0xf3, 0x65, 0xc2, 0x54, 0x91, 0x7, 0xa0, 0x36, 0xa2, 0x34, 0x93, 0x5, 0xc0, 0x56, 0xf1, 0x67, 0x66, 0xf0, 0x57, 0xc1, 0x4, 0x92, 0x35, 0xa3, 0x6e, 0xf8, 0x5f, 0xc9, 0xc, 0x9a, 0x3d, 0xab, 0xaa, 0x3c, 0x9b, 0xd, 0xc8, 0x5e, 0xf9, 0x6f, 0xfb, 0x6d, 0xca, 0x5c, 0x99, 0xf, 0xa8, 0x3e, 0x3f, 0xa9, 0xe, 0x98, 0x5d, 0xcb, 0x6c, 0xfa, 0x59, 0xcf, 0x68, 0xfe, 0x3b, 0xad, 0xa, 0x9c, 0x9d, 0xb, 0xac, 0x3a, 0xff, 0x69, 0xce, 0x58, 0xcc, 0x5a, 0xfd, 0x6b, 0xae, 0x38, 0x9f, 0x9, 0x8, 0x9e, 0x39, 0xaf, 0x6a, 0xfc, 0x5b, 0xcd, 0xdc, 0x4a, 0xed, 0x7b, 0xbe, 0x28, 0x8f, 0x19, 0x18, 0x8e, 0x29, 0xbf, 0x7a, 0xec, 0x4b, 0xdd, 0x49, 0xdf, 0x78, 0xee, 0x2b, 0xbd, 0x1a, 0x8c, 0x8d, 0x1b, 0xbc, 0x2a, 0xef, 0x79, 0xde, 0x48, 0xeb, 0x7d, 0xda, 0x4c, 0x89, 0x1f, 0xb8, 0x2e, 0x2f, 0xb9, 0x1e, 0x88, 0x4d, 0xdb, 0x7c, 0xea, 0x7e, 0xe8, 0x4f, 0xd9, 0x1c, 0x8a, 0x2d, 0xbb, 0xba, 0x2c, 0x8b, 0x1d, 0xd8, 0x4e, 0xe9, 0x7f, 0xb2, 0x24, 0x83, 0x15, 0xd0, 0x46, 0xe1, 0x77, 0x76, 0xe0, 0x47, 0xd1, 0x14, 0x82, 0x25, 0xb3, 0x27, 0xb1, 0x16, 0x80, 0x45, 0xd3, 0x74, 0xe2, 0xe3, 0x75, 0xd2, 0x44, 0x81, 0x17, 0xb0, 0x26, 0x85, 0x13, 0xb4, 0x22, 0xe7, 0x71, 0xd6, 0x40, 0x41, 0xd7, 0x70, 0xe6, 0x23, 0xb5, 0x12, 0x84, 0x10, 0x86, 0x21, 0xb7, 0x72, 0xe4, 0x43, 0xd5, 0xd4, 0x42, 0xe5, 0x73, 0xb6, 0x20, 0x87, 0x11},
+ {0x0, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0xe, 0x85, 0x12, 0xb6, 0x21, 0xe3, 0x74, 0xd0, 0x47, 0x49, 0xde, 0x7a, 0xed, 0x2f, 0xb8, 0x1c, 0x8b, 0x17, 0x80, 0x24, 0xb3, 0x71, 0xe6, 0x42, 0xd5, 0xdb, 0x4c, 0xe8, 0x7f, 0xbd, 0x2a, 0x8e, 0x19, 0x92, 0x5, 0xa1, 0x36, 0xf4, 0x63, 0xc7, 0x50, 0x5e, 0xc9, 0x6d, 0xfa, 0x38, 0xaf, 0xb, 0x9c, 0x2e, 0xb9, 0x1d, 0x8a, 0x48, 0xdf, 0x7b, 0xec, 0xe2, 0x75, 0xd1, 0x46, 0x84, 0x13, 0xb7, 0x20, 0xab, 0x3c, 0x98, 0xf, 0xcd, 0x5a, 0xfe, 0x69, 0x67, 0xf0, 0x54, 0xc3, 0x1, 0x96, 0x32, 0xa5, 0x39, 0xae, 0xa, 0x9d, 0x5f, 0xc8, 0x6c, 0xfb, 0xf5, 0x62, 0xc6, 0x51, 0x93, 0x4, 0xa0, 0x37, 0xbc, 0x2b, 0x8f, 0x18, 0xda, 0x4d, 0xe9, 0x7e, 0x70, 0xe7, 0x43, 0xd4, 0x16, 0x81, 0x25, 0xb2, 0x5c, 0xcb, 0x6f, 0xf8, 0x3a, 0xad, 0x9, 0x9e, 0x90, 0x7, 0xa3, 0x34, 0xf6, 0x61, 0xc5, 0x52, 0xd9, 0x4e, 0xea, 0x7d, 0xbf, 0x28, 0x8c, 0x1b, 0x15, 0x82, 0x26, 0xb1, 0x73, 0xe4, 0x40, 0xd7, 0x4b, 0xdc, 0x78, 0xef, 0x2d, 0xba, 0x1e, 0x89, 0x87, 0x10, 0xb4, 0x23, 0xe1, 0x76, 0xd2, 0x45, 0xce, 0x59, 0xfd, 0x6a, 0xa8, 0x3f, 0x9b, 0xc, 0x2, 0x95, 0x31, 0xa6, 0x64, 0xf3, 0x57, 0xc0, 0x72, 0xe5, 0x41, 0xd6, 0x14, 0x83, 0x27, 0xb0, 0xbe, 0x29, 0x8d, 0x1a, 0xd8, 0x4f, 0xeb, 0x7c, 0xf7, 0x60, 0xc4, 0x53, 0x91, 0x6, 0xa2, 0x35, 0x3b, 0xac, 0x8, 0x9f, 0x5d, 0xca, 0x6e, 0xf9, 0x65, 0xf2, 0x56, 0xc1, 0x3, 0x94, 0x30, 0xa7, 0xa9, 0x3e, 0x9a, 0xd, 0xcf, 0x58, 0xfc, 0x6b, 0xe0, 0x77, 0xd3, 0x44, 0x86, 0x11, 0xb5, 0x22, 0x2c, 0xbb, 0x1f, 0x88, 0x4a, 0xdd, 0x79, 0xee},
+ {0x0, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x1, 0xee, 0x76, 0xc3, 0x5b, 0x75, 0xed, 0x58, 0xc0, 0x2f, 0xb7, 0x2, 0x9a, 0xc1, 0x59, 0xec, 0x74, 0x9b, 0x3, 0xb6, 0x2e, 0xea, 0x72, 0xc7, 0x5f, 0xb0, 0x28, 0x9d, 0x5, 0x5e, 0xc6, 0x73, 0xeb, 0x4, 0x9c, 0x29, 0xb1, 0x9f, 0x7, 0xb2, 0x2a, 0xc5, 0x5d, 0xe8, 0x70, 0x2b, 0xb3, 0x6, 0x9e, 0x71, 0xe9, 0x5c, 0xc4, 0xc9, 0x51, 0xe4, 0x7c, 0x93, 0xb, 0xbe, 0x26, 0x7d, 0xe5, 0x50, 0xc8, 0x27, 0xbf, 0xa, 0x92, 0xbc, 0x24, 0x91, 0x9, 0xe6, 0x7e, 0xcb, 0x53, 0x8, 0x90, 0x25, 0xbd, 0x52, 0xca, 0x7f, 0xe7, 0x23, 0xbb, 0xe, 0x96, 0x79, 0xe1, 0x54, 0xcc, 0x97, 0xf, 0xba, 0x22, 0xcd, 0x55, 0xe0, 0x78, 0x56, 0xce, 0x7b, 0xe3, 0xc, 0x94, 0x21, 0xb9, 0xe2, 0x7a, 0xcf, 0x57, 0xb8, 0x20, 0x95, 0xd, 0x8f, 0x17, 0xa2, 0x3a, 0xd5, 0x4d, 0xf8, 0x60, 0x3b, 0xa3, 0x16, 0x8e, 0x61, 0xf9, 0x4c, 0xd4, 0xfa, 0x62, 0xd7, 0x4f, 0xa0, 0x38, 0x8d, 0x15, 0x4e, 0xd6, 0x63, 0xfb, 0x14, 0x8c, 0x39, 0xa1, 0x65, 0xfd, 0x48, 0xd0, 0x3f, 0xa7, 0x12, 0x8a, 0xd1, 0x49, 0xfc, 0x64, 0x8b, 0x13, 0xa6, 0x3e, 0x10, 0x88, 0x3d, 0xa5, 0x4a, 0xd2, 0x67, 0xff, 0xa4, 0x3c, 0x89, 0x11, 0xfe, 0x66, 0xd3, 0x4b, 0x46, 0xde, 0x6b, 0xf3, 0x1c, 0x84, 0x31, 0xa9, 0xf2, 0x6a, 0xdf, 0x47, 0xa8, 0x30, 0x85, 0x1d, 0x33, 0xab, 0x1e, 0x86, 0x69, 0xf1, 0x44, 0xdc, 0x87, 0x1f, 0xaa, 0x32, 0xdd, 0x45, 0xf0, 0x68, 0xac, 0x34, 0x81, 0x19, 0xf6, 0x6e, 0xdb, 0x43, 0x18, 0x80, 0x35, 0xad, 0x42, 0xda, 0x6f, 0xf7, 0xd9, 0x41, 0xf4, 0x6c, 0x83, 0x1b, 0xae, 0x36, 0x6d, 0xf5, 0x40, 0xd8, 0x37, 0xaf, 0x1a, 0x82},
+ {0x0, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0xa, 0xe2, 0x7b, 0xcd, 0x54, 0x65, 0xfc, 0x4a, 0xd3, 0x3b, 0xa2, 0x14, 0x8d, 0xd9, 0x40, 0xf6, 0x6f, 0x87, 0x1e, 0xa8, 0x31, 0xca, 0x53, 0xe5, 0x7c, 0x94, 0xd, 0xbb, 0x22, 0x76, 0xef, 0x59, 0xc0, 0x28, 0xb1, 0x7, 0x9e, 0xaf, 0x36, 0x80, 0x19, 0xf1, 0x68, 0xde, 0x47, 0x13, 0x8a, 0x3c, 0xa5, 0x4d, 0xd4, 0x62, 0xfb, 0x89, 0x10, 0xa6, 0x3f, 0xd7, 0x4e, 0xf8, 0x61, 0x35, 0xac, 0x1a, 0x83, 0x6b, 0xf2, 0x44, 0xdd, 0xec, 0x75, 0xc3, 0x5a, 0xb2, 0x2b, 0x9d, 0x4, 0x50, 0xc9, 0x7f, 0xe6, 0xe, 0x97, 0x21, 0xb8, 0x43, 0xda, 0x6c, 0xf5, 0x1d, 0x84, 0x32, 0xab, 0xff, 0x66, 0xd0, 0x49, 0xa1, 0x38, 0x8e, 0x17, 0x26, 0xbf, 0x9, 0x90, 0x78, 0xe1, 0x57, 0xce, 0x9a, 0x3, 0xb5, 0x2c, 0xc4, 0x5d, 0xeb, 0x72, 0xf, 0x96, 0x20, 0xb9, 0x51, 0xc8, 0x7e, 0xe7, 0xb3, 0x2a, 0x9c, 0x5, 0xed, 0x74, 0xc2, 0x5b, 0x6a, 0xf3, 0x45, 0xdc, 0x34, 0xad, 0x1b, 0x82, 0xd6, 0x4f, 0xf9, 0x60, 0x88, 0x11, 0xa7, 0x3e, 0xc5, 0x5c, 0xea, 0x73, 0x9b, 0x2, 0xb4, 0x2d, 0x79, 0xe0, 0x56, 0xcf, 0x27, 0xbe, 0x8, 0x91, 0xa0, 0x39, 0x8f, 0x16, 0xfe, 0x67, 0xd1, 0x48, 0x1c, 0x85, 0x33, 0xaa, 0x42, 0xdb, 0x6d, 0xf4, 0x86, 0x1f, 0xa9, 0x30, 0xd8, 0x41, 0xf7, 0x6e, 0x3a, 0xa3, 0x15, 0x8c, 0x64, 0xfd, 0x4b, 0xd2, 0xe3, 0x7a, 0xcc, 0x55, 0xbd, 0x24, 0x92, 0xb, 0x5f, 0xc6, 0x70, 0xe9, 0x1, 0x98, 0x2e, 0xb7, 0x4c, 0xd5, 0x63, 0xfa, 0x12, 0x8b, 0x3d, 0xa4, 0xf0, 0x69, 0xdf, 0x46, 0xae, 0x37, 0x81, 0x18, 0x29, 0xb0, 0x6, 0x9f, 0x77, 0xee, 0x58, 0xc1, 0x95, 0xc, 0xba, 0x23, 0xcb, 0x52, 0xe4, 0x7d},
+ {0x0, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45, 0x55, 0xcf, 0x7c, 0xe6, 0x7, 0x9d, 0x2e, 0xb4, 0xf1, 0x6b, 0xd8, 0x42, 0xa3, 0x39, 0x8a, 0x10, 0xaa, 0x30, 0x83, 0x19, 0xf8, 0x62, 0xd1, 0x4b, 0xe, 0x94, 0x27, 0xbd, 0x5c, 0xc6, 0x75, 0xef, 0xff, 0x65, 0xd6, 0x4c, 0xad, 0x37, 0x84, 0x1e, 0x5b, 0xc1, 0x72, 0xe8, 0x9, 0x93, 0x20, 0xba, 0x49, 0xd3, 0x60, 0xfa, 0x1b, 0x81, 0x32, 0xa8, 0xed, 0x77, 0xc4, 0x5e, 0xbf, 0x25, 0x96, 0xc, 0x1c, 0x86, 0x35, 0xaf, 0x4e, 0xd4, 0x67, 0xfd, 0xb8, 0x22, 0x91, 0xb, 0xea, 0x70, 0xc3, 0x59, 0xe3, 0x79, 0xca, 0x50, 0xb1, 0x2b, 0x98, 0x2, 0x47, 0xdd, 0x6e, 0xf4, 0x15, 0x8f, 0x3c, 0xa6, 0xb6, 0x2c, 0x9f, 0x5, 0xe4, 0x7e, 0xcd, 0x57, 0x12, 0x88, 0x3b, 0xa1, 0x40, 0xda, 0x69, 0xf3, 0x92, 0x8, 0xbb, 0x21, 0xc0, 0x5a, 0xe9, 0x73, 0x36, 0xac, 0x1f, 0x85, 0x64, 0xfe, 0x4d, 0xd7, 0xc7, 0x5d, 0xee, 0x74, 0x95, 0xf, 0xbc, 0x26, 0x63, 0xf9, 0x4a, 0xd0, 0x31, 0xab, 0x18, 0x82, 0x38, 0xa2, 0x11, 0x8b, 0x6a, 0xf0, 0x43, 0xd9, 0x9c, 0x6, 0xb5, 0x2f, 0xce, 0x54, 0xe7, 0x7d, 0x6d, 0xf7, 0x44, 0xde, 0x3f, 0xa5, 0x16, 0x8c, 0xc9, 0x53, 0xe0, 0x7a, 0x9b, 0x1, 0xb2, 0x28, 0xdb, 0x41, 0xf2, 0x68, 0x89, 0x13, 0xa0, 0x3a, 0x7f, 0xe5, 0x56, 0xcc, 0x2d, 0xb7, 0x4, 0x9e, 0x8e, 0x14, 0xa7, 0x3d, 0xdc, 0x46, 0xf5, 0x6f, 0x2a, 0xb0, 0x3, 0x99, 0x78, 0xe2, 0x51, 0xcb, 0x71, 0xeb, 0x58, 0xc2, 0x23, 0xb9, 0xa, 0x90, 0xd5, 0x4f, 0xfc, 0x66, 0x87, 0x1d, 0xae, 0x34, 0x24, 0xbe, 0xd, 0x97, 0x76, 0xec, 0x5f, 0xc5, 0x80, 0x1a, 0xa9, 0x33, 0xd2, 0x48, 0xfb, 0x61},
+ {0x0, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a, 0x45, 0xde, 0x6e, 0xf5, 0x13, 0x88, 0x38, 0xa3, 0xe9, 0x72, 0xc2, 0x59, 0xbf, 0x24, 0x94, 0xf, 0x8a, 0x11, 0xa1, 0x3a, 0xdc, 0x47, 0xf7, 0x6c, 0x26, 0xbd, 0xd, 0x96, 0x70, 0xeb, 0x5b, 0xc0, 0xcf, 0x54, 0xe4, 0x7f, 0x99, 0x2, 0xb2, 0x29, 0x63, 0xf8, 0x48, 0xd3, 0x35, 0xae, 0x1e, 0x85, 0x9, 0x92, 0x22, 0xb9, 0x5f, 0xc4, 0x74, 0xef, 0xa5, 0x3e, 0x8e, 0x15, 0xf3, 0x68, 0xd8, 0x43, 0x4c, 0xd7, 0x67, 0xfc, 0x1a, 0x81, 0x31, 0xaa, 0xe0, 0x7b, 0xcb, 0x50, 0xb6, 0x2d, 0x9d, 0x6, 0x83, 0x18, 0xa8, 0x33, 0xd5, 0x4e, 0xfe, 0x65, 0x2f, 0xb4, 0x4, 0x9f, 0x79, 0xe2, 0x52, 0xc9, 0xc6, 0x5d, 0xed, 0x76, 0x90, 0xb, 0xbb, 0x20, 0x6a, 0xf1, 0x41, 0xda, 0x3c, 0xa7, 0x17, 0x8c, 0x12, 0x89, 0x39, 0xa2, 0x44, 0xdf, 0x6f, 0xf4, 0xbe, 0x25, 0x95, 0xe, 0xe8, 0x73, 0xc3, 0x58, 0x57, 0xcc, 0x7c, 0xe7, 0x1, 0x9a, 0x2a, 0xb1, 0xfb, 0x60, 0xd0, 0x4b, 0xad, 0x36, 0x86, 0x1d, 0x98, 0x3, 0xb3, 0x28, 0xce, 0x55, 0xe5, 0x7e, 0x34, 0xaf, 0x1f, 0x84, 0x62, 0xf9, 0x49, 0xd2, 0xdd, 0x46, 0xf6, 0x6d, 0x8b, 0x10, 0xa0, 0x3b, 0x71, 0xea, 0x5a, 0xc1, 0x27, 0xbc, 0xc, 0x97, 0x1b, 0x80, 0x30, 0xab, 0x4d, 0xd6, 0x66, 0xfd, 0xb7, 0x2c, 0x9c, 0x7, 0xe1, 0x7a, 0xca, 0x51, 0x5e, 0xc5, 0x75, 0xee, 0x8, 0x93, 0x23, 0xb8, 0xf2, 0x69, 0xd9, 0x42, 0xa4, 0x3f, 0x8f, 0x14, 0x91, 0xa, 0xba, 0x21, 0xc7, 0x5c, 0xec, 0x77, 0x3d, 0xa6, 0x16, 0x8d, 0x6b, 0xf0, 0x40, 0xdb, 0xd4, 0x4f, 0xff, 0x64, 0x82, 0x19, 0xa9, 0x32, 0x78, 0xe3, 0x53, 0xc8, 0x2e, 0xb5, 0x5, 0x9e},
+ {0x0, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x8, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67, 0x35, 0xa9, 0x10, 0x8c, 0x7f, 0xe3, 0x5a, 0xc6, 0xa1, 0x3d, 0x84, 0x18, 0xeb, 0x77, 0xce, 0x52, 0x6a, 0xf6, 0x4f, 0xd3, 0x20, 0xbc, 0x5, 0x99, 0xfe, 0x62, 0xdb, 0x47, 0xb4, 0x28, 0x91, 0xd, 0x5f, 0xc3, 0x7a, 0xe6, 0x15, 0x89, 0x30, 0xac, 0xcb, 0x57, 0xee, 0x72, 0x81, 0x1d, 0xa4, 0x38, 0xd4, 0x48, 0xf1, 0x6d, 0x9e, 0x2, 0xbb, 0x27, 0x40, 0xdc, 0x65, 0xf9, 0xa, 0x96, 0x2f, 0xb3, 0xe1, 0x7d, 0xc4, 0x58, 0xab, 0x37, 0x8e, 0x12, 0x75, 0xe9, 0x50, 0xcc, 0x3f, 0xa3, 0x1a, 0x86, 0xbe, 0x22, 0x9b, 0x7, 0xf4, 0x68, 0xd1, 0x4d, 0x2a, 0xb6, 0xf, 0x93, 0x60, 0xfc, 0x45, 0xd9, 0x8b, 0x17, 0xae, 0x32, 0xc1, 0x5d, 0xe4, 0x78, 0x1f, 0x83, 0x3a, 0xa6, 0x55, 0xc9, 0x70, 0xec, 0xb5, 0x29, 0x90, 0xc, 0xff, 0x63, 0xda, 0x46, 0x21, 0xbd, 0x4, 0x98, 0x6b, 0xf7, 0x4e, 0xd2, 0x80, 0x1c, 0xa5, 0x39, 0xca, 0x56, 0xef, 0x73, 0x14, 0x88, 0x31, 0xad, 0x5e, 0xc2, 0x7b, 0xe7, 0xdf, 0x43, 0xfa, 0x66, 0x95, 0x9, 0xb0, 0x2c, 0x4b, 0xd7, 0x6e, 0xf2, 0x1, 0x9d, 0x24, 0xb8, 0xea, 0x76, 0xcf, 0x53, 0xa0, 0x3c, 0x85, 0x19, 0x7e, 0xe2, 0x5b, 0xc7, 0x34, 0xa8, 0x11, 0x8d, 0x61, 0xfd, 0x44, 0xd8, 0x2b, 0xb7, 0xe, 0x92, 0xf5, 0x69, 0xd0, 0x4c, 0xbf, 0x23, 0x9a, 0x6, 0x54, 0xc8, 0x71, 0xed, 0x1e, 0x82, 0x3b, 0xa7, 0xc0, 0x5c, 0xe5, 0x79, 0x8a, 0x16, 0xaf, 0x33, 0xb, 0x97, 0x2e, 0xb2, 0x41, 0xdd, 0x64, 0xf8, 0x9f, 0x3, 0xba, 0x26, 0xd5, 0x49, 0xf0, 0x6c, 0x3e, 0xa2, 0x1b, 0x87, 0x74, 0xe8, 0x51, 0xcd, 0xaa, 0x36, 0x8f, 0x13, 0xe0, 0x7c, 0xc5, 0x59},
+ {0x0, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x1, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68, 0x25, 0xb8, 0x2, 0x9f, 0x6b, 0xf6, 0x4c, 0xd1, 0xb9, 0x24, 0x9e, 0x3, 0xf7, 0x6a, 0xd0, 0x4d, 0x4a, 0xd7, 0x6d, 0xf0, 0x4, 0x99, 0x23, 0xbe, 0xd6, 0x4b, 0xf1, 0x6c, 0x98, 0x5, 0xbf, 0x22, 0x6f, 0xf2, 0x48, 0xd5, 0x21, 0xbc, 0x6, 0x9b, 0xf3, 0x6e, 0xd4, 0x49, 0xbd, 0x20, 0x9a, 0x7, 0x94, 0x9, 0xb3, 0x2e, 0xda, 0x47, 0xfd, 0x60, 0x8, 0x95, 0x2f, 0xb2, 0x46, 0xdb, 0x61, 0xfc, 0xb1, 0x2c, 0x96, 0xb, 0xff, 0x62, 0xd8, 0x45, 0x2d, 0xb0, 0xa, 0x97, 0x63, 0xfe, 0x44, 0xd9, 0xde, 0x43, 0xf9, 0x64, 0x90, 0xd, 0xb7, 0x2a, 0x42, 0xdf, 0x65, 0xf8, 0xc, 0x91, 0x2b, 0xb6, 0xfb, 0x66, 0xdc, 0x41, 0xb5, 0x28, 0x92, 0xf, 0x67, 0xfa, 0x40, 0xdd, 0x29, 0xb4, 0xe, 0x93, 0x35, 0xa8, 0x12, 0x8f, 0x7b, 0xe6, 0x5c, 0xc1, 0xa9, 0x34, 0x8e, 0x13, 0xe7, 0x7a, 0xc0, 0x5d, 0x10, 0x8d, 0x37, 0xaa, 0x5e, 0xc3, 0x79, 0xe4, 0x8c, 0x11, 0xab, 0x36, 0xc2, 0x5f, 0xe5, 0x78, 0x7f, 0xe2, 0x58, 0xc5, 0x31, 0xac, 0x16, 0x8b, 0xe3, 0x7e, 0xc4, 0x59, 0xad, 0x30, 0x8a, 0x17, 0x5a, 0xc7, 0x7d, 0xe0, 0x14, 0x89, 0x33, 0xae, 0xc6, 0x5b, 0xe1, 0x7c, 0x88, 0x15, 0xaf, 0x32, 0xa1, 0x3c, 0x86, 0x1b, 0xef, 0x72, 0xc8, 0x55, 0x3d, 0xa0, 0x1a, 0x87, 0x73, 0xee, 0x54, 0xc9, 0x84, 0x19, 0xa3, 0x3e, 0xca, 0x57, 0xed, 0x70, 0x18, 0x85, 0x3f, 0xa2, 0x56, 0xcb, 0x71, 0xec, 0xeb, 0x76, 0xcc, 0x51, 0xa5, 0x38, 0x82, 0x1f, 0x77, 0xea, 0x50, 0xcd, 0x39, 0xa4, 0x1e, 0x83, 0xce, 0x53, 0xe9, 0x74, 0x80, 0x1d, 0xa7, 0x3a, 0x52, 0xcf, 0x75, 0xe8, 0x1c, 0x81, 0x3b, 0xa6},
+ {0x0, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79, 0x15, 0x8b, 0x34, 0xaa, 0x57, 0xc9, 0x76, 0xe8, 0x91, 0xf, 0xb0, 0x2e, 0xd3, 0x4d, 0xf2, 0x6c, 0x2a, 0xb4, 0xb, 0x95, 0x68, 0xf6, 0x49, 0xd7, 0xae, 0x30, 0x8f, 0x11, 0xec, 0x72, 0xcd, 0x53, 0x3f, 0xa1, 0x1e, 0x80, 0x7d, 0xe3, 0x5c, 0xc2, 0xbb, 0x25, 0x9a, 0x4, 0xf9, 0x67, 0xd8, 0x46, 0x54, 0xca, 0x75, 0xeb, 0x16, 0x88, 0x37, 0xa9, 0xd0, 0x4e, 0xf1, 0x6f, 0x92, 0xc, 0xb3, 0x2d, 0x41, 0xdf, 0x60, 0xfe, 0x3, 0x9d, 0x22, 0xbc, 0xc5, 0x5b, 0xe4, 0x7a, 0x87, 0x19, 0xa6, 0x38, 0x7e, 0xe0, 0x5f, 0xc1, 0x3c, 0xa2, 0x1d, 0x83, 0xfa, 0x64, 0xdb, 0x45, 0xb8, 0x26, 0x99, 0x7, 0x6b, 0xf5, 0x4a, 0xd4, 0x29, 0xb7, 0x8, 0x96, 0xef, 0x71, 0xce, 0x50, 0xad, 0x33, 0x8c, 0x12, 0xa8, 0x36, 0x89, 0x17, 0xea, 0x74, 0xcb, 0x55, 0x2c, 0xb2, 0xd, 0x93, 0x6e, 0xf0, 0x4f, 0xd1, 0xbd, 0x23, 0x9c, 0x2, 0xff, 0x61, 0xde, 0x40, 0x39, 0xa7, 0x18, 0x86, 0x7b, 0xe5, 0x5a, 0xc4, 0x82, 0x1c, 0xa3, 0x3d, 0xc0, 0x5e, 0xe1, 0x7f, 0x6, 0x98, 0x27, 0xb9, 0x44, 0xda, 0x65, 0xfb, 0x97, 0x9, 0xb6, 0x28, 0xd5, 0x4b, 0xf4, 0x6a, 0x13, 0x8d, 0x32, 0xac, 0x51, 0xcf, 0x70, 0xee, 0xfc, 0x62, 0xdd, 0x43, 0xbe, 0x20, 0x9f, 0x1, 0x78, 0xe6, 0x59, 0xc7, 0x3a, 0xa4, 0x1b, 0x85, 0xe9, 0x77, 0xc8, 0x56, 0xab, 0x35, 0x8a, 0x14, 0x6d, 0xf3, 0x4c, 0xd2, 0x2f, 0xb1, 0xe, 0x90, 0xd6, 0x48, 0xf7, 0x69, 0x94, 0xa, 0xb5, 0x2b, 0x52, 0xcc, 0x73, 0xed, 0x10, 0x8e, 0x31, 0xaf, 0xc3, 0x5d, 0xe2, 0x7c, 0x81, 0x1f, 0xa0, 0x3e, 0x47, 0xd9, 0x66, 0xf8, 0x5, 0x9b, 0x24, 0xba},
+ {0x0, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76, 0x5, 0x9a, 0x26, 0xb9, 0x43, 0xdc, 0x60, 0xff, 0x89, 0x16, 0xaa, 0x35, 0xcf, 0x50, 0xec, 0x73, 0xa, 0x95, 0x29, 0xb6, 0x4c, 0xd3, 0x6f, 0xf0, 0x86, 0x19, 0xa5, 0x3a, 0xc0, 0x5f, 0xe3, 0x7c, 0xf, 0x90, 0x2c, 0xb3, 0x49, 0xd6, 0x6a, 0xf5, 0x83, 0x1c, 0xa0, 0x3f, 0xc5, 0x5a, 0xe6, 0x79, 0x14, 0x8b, 0x37, 0xa8, 0x52, 0xcd, 0x71, 0xee, 0x98, 0x7, 0xbb, 0x24, 0xde, 0x41, 0xfd, 0x62, 0x11, 0x8e, 0x32, 0xad, 0x57, 0xc8, 0x74, 0xeb, 0x9d, 0x2, 0xbe, 0x21, 0xdb, 0x44, 0xf8, 0x67, 0x1e, 0x81, 0x3d, 0xa2, 0x58, 0xc7, 0x7b, 0xe4, 0x92, 0xd, 0xb1, 0x2e, 0xd4, 0x4b, 0xf7, 0x68, 0x1b, 0x84, 0x38, 0xa7, 0x5d, 0xc2, 0x7e, 0xe1, 0x97, 0x8, 0xb4, 0x2b, 0xd1, 0x4e, 0xf2, 0x6d, 0x28, 0xb7, 0xb, 0x94, 0x6e, 0xf1, 0x4d, 0xd2, 0xa4, 0x3b, 0x87, 0x18, 0xe2, 0x7d, 0xc1, 0x5e, 0x2d, 0xb2, 0xe, 0x91, 0x6b, 0xf4, 0x48, 0xd7, 0xa1, 0x3e, 0x82, 0x1d, 0xe7, 0x78, 0xc4, 0x5b, 0x22, 0xbd, 0x1, 0x9e, 0x64, 0xfb, 0x47, 0xd8, 0xae, 0x31, 0x8d, 0x12, 0xe8, 0x77, 0xcb, 0x54, 0x27, 0xb8, 0x4, 0x9b, 0x61, 0xfe, 0x42, 0xdd, 0xab, 0x34, 0x88, 0x17, 0xed, 0x72, 0xce, 0x51, 0x3c, 0xa3, 0x1f, 0x80, 0x7a, 0xe5, 0x59, 0xc6, 0xb0, 0x2f, 0x93, 0xc, 0xf6, 0x69, 0xd5, 0x4a, 0x39, 0xa6, 0x1a, 0x85, 0x7f, 0xe0, 0x5c, 0xc3, 0xb5, 0x2a, 0x96, 0x9, 0xf3, 0x6c, 0xd0, 0x4f, 0x36, 0xa9, 0x15, 0x8a, 0x70, 0xef, 0x53, 0xcc, 0xba, 0x25, 0x99, 0x6, 0xfc, 0x63, 0xdf, 0x40, 0x33, 0xac, 0x10, 0x8f, 0x75, 0xea, 0x56, 0xc9, 0xbf, 0x20, 0x9c, 0x3, 0xf9, 0x66, 0xda, 0x45},
+ {0x0, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e, 0xd2, 0x72, 0x8f, 0x2f, 0x68, 0xc8, 0x35, 0x95, 0xbb, 0x1b, 0xe6, 0x46, 0x1, 0xa1, 0x5c, 0xfc, 0xb9, 0x19, 0xe4, 0x44, 0x3, 0xa3, 0x5e, 0xfe, 0xd0, 0x70, 0x8d, 0x2d, 0x6a, 0xca, 0x37, 0x97, 0x6b, 0xcb, 0x36, 0x96, 0xd1, 0x71, 0x8c, 0x2c, 0x2, 0xa2, 0x5f, 0xff, 0xb8, 0x18, 0xe5, 0x45, 0x6f, 0xcf, 0x32, 0x92, 0xd5, 0x75, 0x88, 0x28, 0x6, 0xa6, 0x5b, 0xfb, 0xbc, 0x1c, 0xe1, 0x41, 0xbd, 0x1d, 0xe0, 0x40, 0x7, 0xa7, 0x5a, 0xfa, 0xd4, 0x74, 0x89, 0x29, 0x6e, 0xce, 0x33, 0x93, 0xd6, 0x76, 0x8b, 0x2b, 0x6c, 0xcc, 0x31, 0x91, 0xbf, 0x1f, 0xe2, 0x42, 0x5, 0xa5, 0x58, 0xf8, 0x4, 0xa4, 0x59, 0xf9, 0xbe, 0x1e, 0xe3, 0x43, 0x6d, 0xcd, 0x30, 0x90, 0xd7, 0x77, 0x8a, 0x2a, 0xde, 0x7e, 0x83, 0x23, 0x64, 0xc4, 0x39, 0x99, 0xb7, 0x17, 0xea, 0x4a, 0xd, 0xad, 0x50, 0xf0, 0xc, 0xac, 0x51, 0xf1, 0xb6, 0x16, 0xeb, 0x4b, 0x65, 0xc5, 0x38, 0x98, 0xdf, 0x7f, 0x82, 0x22, 0x67, 0xc7, 0x3a, 0x9a, 0xdd, 0x7d, 0x80, 0x20, 0xe, 0xae, 0x53, 0xf3, 0xb4, 0x14, 0xe9, 0x49, 0xb5, 0x15, 0xe8, 0x48, 0xf, 0xaf, 0x52, 0xf2, 0xdc, 0x7c, 0x81, 0x21, 0x66, 0xc6, 0x3b, 0x9b, 0xb1, 0x11, 0xec, 0x4c, 0xb, 0xab, 0x56, 0xf6, 0xd8, 0x78, 0x85, 0x25, 0x62, 0xc2, 0x3f, 0x9f, 0x63, 0xc3, 0x3e, 0x9e, 0xd9, 0x79, 0x84, 0x24, 0xa, 0xaa, 0x57, 0xf7, 0xb0, 0x10, 0xed, 0x4d, 0x8, 0xa8, 0x55, 0xf5, 0xb2, 0x12, 0xef, 0x4f, 0x61, 0xc1, 0x3c, 0x9c, 0xdb, 0x7b, 0x86, 0x26, 0xda, 0x7a, 0x87, 0x27, 0x60, 0xc0, 0x3d, 0x9d, 0xb3, 0x13, 0xee, 0x4e, 0x9, 0xa9, 0x54, 0xf4},
+ {0x0, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21, 0xc2, 0x63, 0x9d, 0x3c, 0x7c, 0xdd, 0x23, 0x82, 0xa3, 0x2, 0xfc, 0x5d, 0x1d, 0xbc, 0x42, 0xe3, 0x99, 0x38, 0xc6, 0x67, 0x27, 0x86, 0x78, 0xd9, 0xf8, 0x59, 0xa7, 0x6, 0x46, 0xe7, 0x19, 0xb8, 0x5b, 0xfa, 0x4, 0xa5, 0xe5, 0x44, 0xba, 0x1b, 0x3a, 0x9b, 0x65, 0xc4, 0x84, 0x25, 0xdb, 0x7a, 0x2f, 0x8e, 0x70, 0xd1, 0x91, 0x30, 0xce, 0x6f, 0x4e, 0xef, 0x11, 0xb0, 0xf0, 0x51, 0xaf, 0xe, 0xed, 0x4c, 0xb2, 0x13, 0x53, 0xf2, 0xc, 0xad, 0x8c, 0x2d, 0xd3, 0x72, 0x32, 0x93, 0x6d, 0xcc, 0xb6, 0x17, 0xe9, 0x48, 0x8, 0xa9, 0x57, 0xf6, 0xd7, 0x76, 0x88, 0x29, 0x69, 0xc8, 0x36, 0x97, 0x74, 0xd5, 0x2b, 0x8a, 0xca, 0x6b, 0x95, 0x34, 0x15, 0xb4, 0x4a, 0xeb, 0xab, 0xa, 0xf4, 0x55, 0x5e, 0xff, 0x1, 0xa0, 0xe0, 0x41, 0xbf, 0x1e, 0x3f, 0x9e, 0x60, 0xc1, 0x81, 0x20, 0xde, 0x7f, 0x9c, 0x3d, 0xc3, 0x62, 0x22, 0x83, 0x7d, 0xdc, 0xfd, 0x5c, 0xa2, 0x3, 0x43, 0xe2, 0x1c, 0xbd, 0xc7, 0x66, 0x98, 0x39, 0x79, 0xd8, 0x26, 0x87, 0xa6, 0x7, 0xf9, 0x58, 0x18, 0xb9, 0x47, 0xe6, 0x5, 0xa4, 0x5a, 0xfb, 0xbb, 0x1a, 0xe4, 0x45, 0x64, 0xc5, 0x3b, 0x9a, 0xda, 0x7b, 0x85, 0x24, 0x71, 0xd0, 0x2e, 0x8f, 0xcf, 0x6e, 0x90, 0x31, 0x10, 0xb1, 0x4f, 0xee, 0xae, 0xf, 0xf1, 0x50, 0xb3, 0x12, 0xec, 0x4d, 0xd, 0xac, 0x52, 0xf3, 0xd2, 0x73, 0x8d, 0x2c, 0x6c, 0xcd, 0x33, 0x92, 0xe8, 0x49, 0xb7, 0x16, 0x56, 0xf7, 0x9, 0xa8, 0x89, 0x28, 0xd6, 0x77, 0x37, 0x96, 0x68, 0xc9, 0x2a, 0x8b, 0x75, 0xd4, 0x94, 0x35, 0xcb, 0x6a, 0x4b, 0xea, 0x14, 0xb5, 0xf5, 0x54, 0xaa, 0xb},
+ {0x0, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30, 0xf2, 0x50, 0xab, 0x9, 0x40, 0xe2, 0x19, 0xbb, 0x8b, 0x29, 0xd2, 0x70, 0x39, 0x9b, 0x60, 0xc2, 0xf9, 0x5b, 0xa0, 0x2, 0x4b, 0xe9, 0x12, 0xb0, 0x80, 0x22, 0xd9, 0x7b, 0x32, 0x90, 0x6b, 0xc9, 0xb, 0xa9, 0x52, 0xf0, 0xb9, 0x1b, 0xe0, 0x42, 0x72, 0xd0, 0x2b, 0x89, 0xc0, 0x62, 0x99, 0x3b, 0xef, 0x4d, 0xb6, 0x14, 0x5d, 0xff, 0x4, 0xa6, 0x96, 0x34, 0xcf, 0x6d, 0x24, 0x86, 0x7d, 0xdf, 0x1d, 0xbf, 0x44, 0xe6, 0xaf, 0xd, 0xf6, 0x54, 0x64, 0xc6, 0x3d, 0x9f, 0xd6, 0x74, 0x8f, 0x2d, 0x16, 0xb4, 0x4f, 0xed, 0xa4, 0x6, 0xfd, 0x5f, 0x6f, 0xcd, 0x36, 0x94, 0xdd, 0x7f, 0x84, 0x26, 0xe4, 0x46, 0xbd, 0x1f, 0x56, 0xf4, 0xf, 0xad, 0x9d, 0x3f, 0xc4, 0x66, 0x2f, 0x8d, 0x76, 0xd4, 0xc3, 0x61, 0x9a, 0x38, 0x71, 0xd3, 0x28, 0x8a, 0xba, 0x18, 0xe3, 0x41, 0x8, 0xaa, 0x51, 0xf3, 0x31, 0x93, 0x68, 0xca, 0x83, 0x21, 0xda, 0x78, 0x48, 0xea, 0x11, 0xb3, 0xfa, 0x58, 0xa3, 0x1, 0x3a, 0x98, 0x63, 0xc1, 0x88, 0x2a, 0xd1, 0x73, 0x43, 0xe1, 0x1a, 0xb8, 0xf1, 0x53, 0xa8, 0xa, 0xc8, 0x6a, 0x91, 0x33, 0x7a, 0xd8, 0x23, 0x81, 0xb1, 0x13, 0xe8, 0x4a, 0x3, 0xa1, 0x5a, 0xf8, 0x2c, 0x8e, 0x75, 0xd7, 0x9e, 0x3c, 0xc7, 0x65, 0x55, 0xf7, 0xc, 0xae, 0xe7, 0x45, 0xbe, 0x1c, 0xde, 0x7c, 0x87, 0x25, 0x6c, 0xce, 0x35, 0x97, 0xa7, 0x5, 0xfe, 0x5c, 0x15, 0xb7, 0x4c, 0xee, 0xd5, 0x77, 0x8c, 0x2e, 0x67, 0xc5, 0x3e, 0x9c, 0xac, 0xe, 0xf5, 0x57, 0x1e, 0xbc, 0x47, 0xe5, 0x27, 0x85, 0x7e, 0xdc, 0x95, 0x37, 0xcc, 0x6e, 0x5e, 0xfc, 0x7, 0xa5, 0xec, 0x4e, 0xb5, 0x17},
+ {0x0, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f, 0xe2, 0x41, 0xb9, 0x1a, 0x54, 0xf7, 0xf, 0xac, 0x93, 0x30, 0xc8, 0x6b, 0x25, 0x86, 0x7e, 0xdd, 0xd9, 0x7a, 0x82, 0x21, 0x6f, 0xcc, 0x34, 0x97, 0xa8, 0xb, 0xf3, 0x50, 0x1e, 0xbd, 0x45, 0xe6, 0x3b, 0x98, 0x60, 0xc3, 0x8d, 0x2e, 0xd6, 0x75, 0x4a, 0xe9, 0x11, 0xb2, 0xfc, 0x5f, 0xa7, 0x4, 0xaf, 0xc, 0xf4, 0x57, 0x19, 0xba, 0x42, 0xe1, 0xde, 0x7d, 0x85, 0x26, 0x68, 0xcb, 0x33, 0x90, 0x4d, 0xee, 0x16, 0xb5, 0xfb, 0x58, 0xa0, 0x3, 0x3c, 0x9f, 0x67, 0xc4, 0x8a, 0x29, 0xd1, 0x72, 0x76, 0xd5, 0x2d, 0x8e, 0xc0, 0x63, 0x9b, 0x38, 0x7, 0xa4, 0x5c, 0xff, 0xb1, 0x12, 0xea, 0x49, 0x94, 0x37, 0xcf, 0x6c, 0x22, 0x81, 0x79, 0xda, 0xe5, 0x46, 0xbe, 0x1d, 0x53, 0xf0, 0x8, 0xab, 0x43, 0xe0, 0x18, 0xbb, 0xf5, 0x56, 0xae, 0xd, 0x32, 0x91, 0x69, 0xca, 0x84, 0x27, 0xdf, 0x7c, 0xa1, 0x2, 0xfa, 0x59, 0x17, 0xb4, 0x4c, 0xef, 0xd0, 0x73, 0x8b, 0x28, 0x66, 0xc5, 0x3d, 0x9e, 0x9a, 0x39, 0xc1, 0x62, 0x2c, 0x8f, 0x77, 0xd4, 0xeb, 0x48, 0xb0, 0x13, 0x5d, 0xfe, 0x6, 0xa5, 0x78, 0xdb, 0x23, 0x80, 0xce, 0x6d, 0x95, 0x36, 0x9, 0xaa, 0x52, 0xf1, 0xbf, 0x1c, 0xe4, 0x47, 0xec, 0x4f, 0xb7, 0x14, 0x5a, 0xf9, 0x1, 0xa2, 0x9d, 0x3e, 0xc6, 0x65, 0x2b, 0x88, 0x70, 0xd3, 0xe, 0xad, 0x55, 0xf6, 0xb8, 0x1b, 0xe3, 0x40, 0x7f, 0xdc, 0x24, 0x87, 0xc9, 0x6a, 0x92, 0x31, 0x35, 0x96, 0x6e, 0xcd, 0x83, 0x20, 0xd8, 0x7b, 0x44, 0xe7, 0x1f, 0xbc, 0xf2, 0x51, 0xa9, 0xa, 0xd7, 0x74, 0x8c, 0x2f, 0x61, 0xc2, 0x3a, 0x99, 0xa6, 0x5, 0xfd, 0x5e, 0x10, 0xb3, 0x4b, 0xe8},
+ {0x0, 0xa4, 0x55, 0xf1, 0xaa, 0xe, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12, 0x92, 0x36, 0xc7, 0x63, 0x38, 0x9c, 0x6d, 0xc9, 0xdb, 0x7f, 0x8e, 0x2a, 0x71, 0xd5, 0x24, 0x80, 0x39, 0x9d, 0x6c, 0xc8, 0x93, 0x37, 0xc6, 0x62, 0x70, 0xd4, 0x25, 0x81, 0xda, 0x7e, 0x8f, 0x2b, 0xab, 0xf, 0xfe, 0x5a, 0x1, 0xa5, 0x54, 0xf0, 0xe2, 0x46, 0xb7, 0x13, 0x48, 0xec, 0x1d, 0xb9, 0x72, 0xd6, 0x27, 0x83, 0xd8, 0x7c, 0x8d, 0x29, 0x3b, 0x9f, 0x6e, 0xca, 0x91, 0x35, 0xc4, 0x60, 0xe0, 0x44, 0xb5, 0x11, 0x4a, 0xee, 0x1f, 0xbb, 0xa9, 0xd, 0xfc, 0x58, 0x3, 0xa7, 0x56, 0xf2, 0x4b, 0xef, 0x1e, 0xba, 0xe1, 0x45, 0xb4, 0x10, 0x2, 0xa6, 0x57, 0xf3, 0xa8, 0xc, 0xfd, 0x59, 0xd9, 0x7d, 0x8c, 0x28, 0x73, 0xd7, 0x26, 0x82, 0x90, 0x34, 0xc5, 0x61, 0x3a, 0x9e, 0x6f, 0xcb, 0xe4, 0x40, 0xb1, 0x15, 0x4e, 0xea, 0x1b, 0xbf, 0xad, 0x9, 0xf8, 0x5c, 0x7, 0xa3, 0x52, 0xf6, 0x76, 0xd2, 0x23, 0x87, 0xdc, 0x78, 0x89, 0x2d, 0x3f, 0x9b, 0x6a, 0xce, 0x95, 0x31, 0xc0, 0x64, 0xdd, 0x79, 0x88, 0x2c, 0x77, 0xd3, 0x22, 0x86, 0x94, 0x30, 0xc1, 0x65, 0x3e, 0x9a, 0x6b, 0xcf, 0x4f, 0xeb, 0x1a, 0xbe, 0xe5, 0x41, 0xb0, 0x14, 0x6, 0xa2, 0x53, 0xf7, 0xac, 0x8, 0xf9, 0x5d, 0x96, 0x32, 0xc3, 0x67, 0x3c, 0x98, 0x69, 0xcd, 0xdf, 0x7b, 0x8a, 0x2e, 0x75, 0xd1, 0x20, 0x84, 0x4, 0xa0, 0x51, 0xf5, 0xae, 0xa, 0xfb, 0x5f, 0x4d, 0xe9, 0x18, 0xbc, 0xe7, 0x43, 0xb2, 0x16, 0xaf, 0xb, 0xfa, 0x5e, 0x5, 0xa1, 0x50, 0xf4, 0xe6, 0x42, 0xb3, 0x17, 0x4c, 0xe8, 0x19, 0xbd, 0x3d, 0x99, 0x68, 0xcc, 0x97, 0x33, 0xc2, 0x66, 0x74, 0xd0, 0x21, 0x85, 0xde, 0x7a, 0x8b, 0x2f},
+ {0x0, 0xa5, 0x57, 0xf2, 0xae, 0xb, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d, 0x82, 0x27, 0xd5, 0x70, 0x2c, 0x89, 0x7b, 0xde, 0xc3, 0x66, 0x94, 0x31, 0x6d, 0xc8, 0x3a, 0x9f, 0x19, 0xbc, 0x4e, 0xeb, 0xb7, 0x12, 0xe0, 0x45, 0x58, 0xfd, 0xf, 0xaa, 0xf6, 0x53, 0xa1, 0x4, 0x9b, 0x3e, 0xcc, 0x69, 0x35, 0x90, 0x62, 0xc7, 0xda, 0x7f, 0x8d, 0x28, 0x74, 0xd1, 0x23, 0x86, 0x32, 0x97, 0x65, 0xc0, 0x9c, 0x39, 0xcb, 0x6e, 0x73, 0xd6, 0x24, 0x81, 0xdd, 0x78, 0x8a, 0x2f, 0xb0, 0x15, 0xe7, 0x42, 0x1e, 0xbb, 0x49, 0xec, 0xf1, 0x54, 0xa6, 0x3, 0x5f, 0xfa, 0x8, 0xad, 0x2b, 0x8e, 0x7c, 0xd9, 0x85, 0x20, 0xd2, 0x77, 0x6a, 0xcf, 0x3d, 0x98, 0xc4, 0x61, 0x93, 0x36, 0xa9, 0xc, 0xfe, 0x5b, 0x7, 0xa2, 0x50, 0xf5, 0xe8, 0x4d, 0xbf, 0x1a, 0x46, 0xe3, 0x11, 0xb4, 0x64, 0xc1, 0x33, 0x96, 0xca, 0x6f, 0x9d, 0x38, 0x25, 0x80, 0x72, 0xd7, 0x8b, 0x2e, 0xdc, 0x79, 0xe6, 0x43, 0xb1, 0x14, 0x48, 0xed, 0x1f, 0xba, 0xa7, 0x2, 0xf0, 0x55, 0x9, 0xac, 0x5e, 0xfb, 0x7d, 0xd8, 0x2a, 0x8f, 0xd3, 0x76, 0x84, 0x21, 0x3c, 0x99, 0x6b, 0xce, 0x92, 0x37, 0xc5, 0x60, 0xff, 0x5a, 0xa8, 0xd, 0x51, 0xf4, 0x6, 0xa3, 0xbe, 0x1b, 0xe9, 0x4c, 0x10, 0xb5, 0x47, 0xe2, 0x56, 0xf3, 0x1, 0xa4, 0xf8, 0x5d, 0xaf, 0xa, 0x17, 0xb2, 0x40, 0xe5, 0xb9, 0x1c, 0xee, 0x4b, 0xd4, 0x71, 0x83, 0x26, 0x7a, 0xdf, 0x2d, 0x88, 0x95, 0x30, 0xc2, 0x67, 0x3b, 0x9e, 0x6c, 0xc9, 0x4f, 0xea, 0x18, 0xbd, 0xe1, 0x44, 0xb6, 0x13, 0xe, 0xab, 0x59, 0xfc, 0xa0, 0x5, 0xf7, 0x52, 0xcd, 0x68, 0x9a, 0x3f, 0x63, 0xc6, 0x34, 0x91, 0x8c, 0x29, 0xdb, 0x7e, 0x22, 0x87, 0x75, 0xd0},
+ {0x0, 0xa6, 0x51, 0xf7, 0xa2, 0x4, 0xf3, 0x55, 0x59, 0xff, 0x8, 0xae, 0xfb, 0x5d, 0xaa, 0xc, 0xb2, 0x14, 0xe3, 0x45, 0x10, 0xb6, 0x41, 0xe7, 0xeb, 0x4d, 0xba, 0x1c, 0x49, 0xef, 0x18, 0xbe, 0x79, 0xdf, 0x28, 0x8e, 0xdb, 0x7d, 0x8a, 0x2c, 0x20, 0x86, 0x71, 0xd7, 0x82, 0x24, 0xd3, 0x75, 0xcb, 0x6d, 0x9a, 0x3c, 0x69, 0xcf, 0x38, 0x9e, 0x92, 0x34, 0xc3, 0x65, 0x30, 0x96, 0x61, 0xc7, 0xf2, 0x54, 0xa3, 0x5, 0x50, 0xf6, 0x1, 0xa7, 0xab, 0xd, 0xfa, 0x5c, 0x9, 0xaf, 0x58, 0xfe, 0x40, 0xe6, 0x11, 0xb7, 0xe2, 0x44, 0xb3, 0x15, 0x19, 0xbf, 0x48, 0xee, 0xbb, 0x1d, 0xea, 0x4c, 0x8b, 0x2d, 0xda, 0x7c, 0x29, 0x8f, 0x78, 0xde, 0xd2, 0x74, 0x83, 0x25, 0x70, 0xd6, 0x21, 0x87, 0x39, 0x9f, 0x68, 0xce, 0x9b, 0x3d, 0xca, 0x6c, 0x60, 0xc6, 0x31, 0x97, 0xc2, 0x64, 0x93, 0x35, 0xf9, 0x5f, 0xa8, 0xe, 0x5b, 0xfd, 0xa, 0xac, 0xa0, 0x6, 0xf1, 0x57, 0x2, 0xa4, 0x53, 0xf5, 0x4b, 0xed, 0x1a, 0xbc, 0xe9, 0x4f, 0xb8, 0x1e, 0x12, 0xb4, 0x43, 0xe5, 0xb0, 0x16, 0xe1, 0x47, 0x80, 0x26, 0xd1, 0x77, 0x22, 0x84, 0x73, 0xd5, 0xd9, 0x7f, 0x88, 0x2e, 0x7b, 0xdd, 0x2a, 0x8c, 0x32, 0x94, 0x63, 0xc5, 0x90, 0x36, 0xc1, 0x67, 0x6b, 0xcd, 0x3a, 0x9c, 0xc9, 0x6f, 0x98, 0x3e, 0xb, 0xad, 0x5a, 0xfc, 0xa9, 0xf, 0xf8, 0x5e, 0x52, 0xf4, 0x3, 0xa5, 0xf0, 0x56, 0xa1, 0x7, 0xb9, 0x1f, 0xe8, 0x4e, 0x1b, 0xbd, 0x4a, 0xec, 0xe0, 0x46, 0xb1, 0x17, 0x42, 0xe4, 0x13, 0xb5, 0x72, 0xd4, 0x23, 0x85, 0xd0, 0x76, 0x81, 0x27, 0x2b, 0x8d, 0x7a, 0xdc, 0x89, 0x2f, 0xd8, 0x7e, 0xc0, 0x66, 0x91, 0x37, 0x62, 0xc4, 0x33, 0x95, 0x99, 0x3f, 0xc8, 0x6e, 0x3b, 0x9d, 0x6a, 0xcc},
+ {0x0, 0xa7, 0x53, 0xf4, 0xa6, 0x1, 0xf5, 0x52, 0x51, 0xf6, 0x2, 0xa5, 0xf7, 0x50, 0xa4, 0x3, 0xa2, 0x5, 0xf1, 0x56, 0x4, 0xa3, 0x57, 0xf0, 0xf3, 0x54, 0xa0, 0x7, 0x55, 0xf2, 0x6, 0xa1, 0x59, 0xfe, 0xa, 0xad, 0xff, 0x58, 0xac, 0xb, 0x8, 0xaf, 0x5b, 0xfc, 0xae, 0x9, 0xfd, 0x5a, 0xfb, 0x5c, 0xa8, 0xf, 0x5d, 0xfa, 0xe, 0xa9, 0xaa, 0xd, 0xf9, 0x5e, 0xc, 0xab, 0x5f, 0xf8, 0xb2, 0x15, 0xe1, 0x46, 0x14, 0xb3, 0x47, 0xe0, 0xe3, 0x44, 0xb0, 0x17, 0x45, 0xe2, 0x16, 0xb1, 0x10, 0xb7, 0x43, 0xe4, 0xb6, 0x11, 0xe5, 0x42, 0x41, 0xe6, 0x12, 0xb5, 0xe7, 0x40, 0xb4, 0x13, 0xeb, 0x4c, 0xb8, 0x1f, 0x4d, 0xea, 0x1e, 0xb9, 0xba, 0x1d, 0xe9, 0x4e, 0x1c, 0xbb, 0x4f, 0xe8, 0x49, 0xee, 0x1a, 0xbd, 0xef, 0x48, 0xbc, 0x1b, 0x18, 0xbf, 0x4b, 0xec, 0xbe, 0x19, 0xed, 0x4a, 0x79, 0xde, 0x2a, 0x8d, 0xdf, 0x78, 0x8c, 0x2b, 0x28, 0x8f, 0x7b, 0xdc, 0x8e, 0x29, 0xdd, 0x7a, 0xdb, 0x7c, 0x88, 0x2f, 0x7d, 0xda, 0x2e, 0x89, 0x8a, 0x2d, 0xd9, 0x7e, 0x2c, 0x8b, 0x7f, 0xd8, 0x20, 0x87, 0x73, 0xd4, 0x86, 0x21, 0xd5, 0x72, 0x71, 0xd6, 0x22, 0x85, 0xd7, 0x70, 0x84, 0x23, 0x82, 0x25, 0xd1, 0x76, 0x24, 0x83, 0x77, 0xd0, 0xd3, 0x74, 0x80, 0x27, 0x75, 0xd2, 0x26, 0x81, 0xcb, 0x6c, 0x98, 0x3f, 0x6d, 0xca, 0x3e, 0x99, 0x9a, 0x3d, 0xc9, 0x6e, 0x3c, 0x9b, 0x6f, 0xc8, 0x69, 0xce, 0x3a, 0x9d, 0xcf, 0x68, 0x9c, 0x3b, 0x38, 0x9f, 0x6b, 0xcc, 0x9e, 0x39, 0xcd, 0x6a, 0x92, 0x35, 0xc1, 0x66, 0x34, 0x93, 0x67, 0xc0, 0xc3, 0x64, 0x90, 0x37, 0x65, 0xc2, 0x36, 0x91, 0x30, 0x97, 0x63, 0xc4, 0x96, 0x31, 0xc5, 0x62, 0x61, 0xc6, 0x32, 0x95, 0xc7, 0x60, 0x94, 0x33},
+ {0x0, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56, 0x52, 0xfa, 0x1f, 0xb7, 0xc8, 0x60, 0x85, 0x2d, 0x7b, 0xd3, 0x36, 0x9e, 0xe1, 0x49, 0xac, 0x4, 0xa4, 0xc, 0xe9, 0x41, 0x3e, 0x96, 0x73, 0xdb, 0x8d, 0x25, 0xc0, 0x68, 0x17, 0xbf, 0x5a, 0xf2, 0xf6, 0x5e, 0xbb, 0x13, 0x6c, 0xc4, 0x21, 0x89, 0xdf, 0x77, 0x92, 0x3a, 0x45, 0xed, 0x8, 0xa0, 0x55, 0xfd, 0x18, 0xb0, 0xcf, 0x67, 0x82, 0x2a, 0x7c, 0xd4, 0x31, 0x99, 0xe6, 0x4e, 0xab, 0x3, 0x7, 0xaf, 0x4a, 0xe2, 0x9d, 0x35, 0xd0, 0x78, 0x2e, 0x86, 0x63, 0xcb, 0xb4, 0x1c, 0xf9, 0x51, 0xf1, 0x59, 0xbc, 0x14, 0x6b, 0xc3, 0x26, 0x8e, 0xd8, 0x70, 0x95, 0x3d, 0x42, 0xea, 0xf, 0xa7, 0xa3, 0xb, 0xee, 0x46, 0x39, 0x91, 0x74, 0xdc, 0x8a, 0x22, 0xc7, 0x6f, 0x10, 0xb8, 0x5d, 0xf5, 0xaa, 0x2, 0xe7, 0x4f, 0x30, 0x98, 0x7d, 0xd5, 0x83, 0x2b, 0xce, 0x66, 0x19, 0xb1, 0x54, 0xfc, 0xf8, 0x50, 0xb5, 0x1d, 0x62, 0xca, 0x2f, 0x87, 0xd1, 0x79, 0x9c, 0x34, 0x4b, 0xe3, 0x6, 0xae, 0xe, 0xa6, 0x43, 0xeb, 0x94, 0x3c, 0xd9, 0x71, 0x27, 0x8f, 0x6a, 0xc2, 0xbd, 0x15, 0xf0, 0x58, 0x5c, 0xf4, 0x11, 0xb9, 0xc6, 0x6e, 0x8b, 0x23, 0x75, 0xdd, 0x38, 0x90, 0xef, 0x47, 0xa2, 0xa, 0xff, 0x57, 0xb2, 0x1a, 0x65, 0xcd, 0x28, 0x80, 0xd6, 0x7e, 0x9b, 0x33, 0x4c, 0xe4, 0x1, 0xa9, 0xad, 0x5, 0xe0, 0x48, 0x37, 0x9f, 0x7a, 0xd2, 0x84, 0x2c, 0xc9, 0x61, 0x1e, 0xb6, 0x53, 0xfb, 0x5b, 0xf3, 0x16, 0xbe, 0xc1, 0x69, 0x8c, 0x24, 0x72, 0xda, 0x3f, 0x97, 0xe8, 0x40, 0xa5, 0xd, 0x9, 0xa1, 0x44, 0xec, 0x93, 0x3b, 0xde, 0x76, 0x20, 0x88, 0x6d, 0xc5, 0xba, 0x12, 0xf7, 0x5f},
+ {0x0, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59, 0x42, 0xeb, 0xd, 0xa4, 0xdc, 0x75, 0x93, 0x3a, 0x63, 0xca, 0x2c, 0x85, 0xfd, 0x54, 0xb2, 0x1b, 0x84, 0x2d, 0xcb, 0x62, 0x1a, 0xb3, 0x55, 0xfc, 0xa5, 0xc, 0xea, 0x43, 0x3b, 0x92, 0x74, 0xdd, 0xc6, 0x6f, 0x89, 0x20, 0x58, 0xf1, 0x17, 0xbe, 0xe7, 0x4e, 0xa8, 0x1, 0x79, 0xd0, 0x36, 0x9f, 0x15, 0xbc, 0x5a, 0xf3, 0x8b, 0x22, 0xc4, 0x6d, 0x34, 0x9d, 0x7b, 0xd2, 0xaa, 0x3, 0xe5, 0x4c, 0x57, 0xfe, 0x18, 0xb1, 0xc9, 0x60, 0x86, 0x2f, 0x76, 0xdf, 0x39, 0x90, 0xe8, 0x41, 0xa7, 0xe, 0x91, 0x38, 0xde, 0x77, 0xf, 0xa6, 0x40, 0xe9, 0xb0, 0x19, 0xff, 0x56, 0x2e, 0x87, 0x61, 0xc8, 0xd3, 0x7a, 0x9c, 0x35, 0x4d, 0xe4, 0x2, 0xab, 0xf2, 0x5b, 0xbd, 0x14, 0x6c, 0xc5, 0x23, 0x8a, 0x2a, 0x83, 0x65, 0xcc, 0xb4, 0x1d, 0xfb, 0x52, 0xb, 0xa2, 0x44, 0xed, 0x95, 0x3c, 0xda, 0x73, 0x68, 0xc1, 0x27, 0x8e, 0xf6, 0x5f, 0xb9, 0x10, 0x49, 0xe0, 0x6, 0xaf, 0xd7, 0x7e, 0x98, 0x31, 0xae, 0x7, 0xe1, 0x48, 0x30, 0x99, 0x7f, 0xd6, 0x8f, 0x26, 0xc0, 0x69, 0x11, 0xb8, 0x5e, 0xf7, 0xec, 0x45, 0xa3, 0xa, 0x72, 0xdb, 0x3d, 0x94, 0xcd, 0x64, 0x82, 0x2b, 0x53, 0xfa, 0x1c, 0xb5, 0x3f, 0x96, 0x70, 0xd9, 0xa1, 0x8, 0xee, 0x47, 0x1e, 0xb7, 0x51, 0xf8, 0x80, 0x29, 0xcf, 0x66, 0x7d, 0xd4, 0x32, 0x9b, 0xe3, 0x4a, 0xac, 0x5, 0x5c, 0xf5, 0x13, 0xba, 0xc2, 0x6b, 0x8d, 0x24, 0xbb, 0x12, 0xf4, 0x5d, 0x25, 0x8c, 0x6a, 0xc3, 0x9a, 0x33, 0xd5, 0x7c, 0x4, 0xad, 0x4b, 0xe2, 0xf9, 0x50, 0xb6, 0x1f, 0x67, 0xce, 0x28, 0x81, 0xd8, 0x71, 0x97, 0x3e, 0x46, 0xef, 0x9, 0xa0},
+ {0x0, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x1, 0xe2, 0x48, 0x72, 0xd8, 0x3b, 0x91, 0xe0, 0x4a, 0xa9, 0x3, 0x4b, 0xe1, 0x2, 0xa8, 0xd9, 0x73, 0x90, 0x3a, 0xe4, 0x4e, 0xad, 0x7, 0x76, 0xdc, 0x3f, 0x95, 0xdd, 0x77, 0x94, 0x3e, 0x4f, 0xe5, 0x6, 0xac, 0x96, 0x3c, 0xdf, 0x75, 0x4, 0xae, 0x4d, 0xe7, 0xaf, 0x5, 0xe6, 0x4c, 0x3d, 0x97, 0x74, 0xde, 0xd5, 0x7f, 0x9c, 0x36, 0x47, 0xed, 0xe, 0xa4, 0xec, 0x46, 0xa5, 0xf, 0x7e, 0xd4, 0x37, 0x9d, 0xa7, 0xd, 0xee, 0x44, 0x35, 0x9f, 0x7c, 0xd6, 0x9e, 0x34, 0xd7, 0x7d, 0xc, 0xa6, 0x45, 0xef, 0x31, 0x9b, 0x78, 0xd2, 0xa3, 0x9, 0xea, 0x40, 0x8, 0xa2, 0x41, 0xeb, 0x9a, 0x30, 0xd3, 0x79, 0x43, 0xe9, 0xa, 0xa0, 0xd1, 0x7b, 0x98, 0x32, 0x7a, 0xd0, 0x33, 0x99, 0xe8, 0x42, 0xa1, 0xb, 0xb7, 0x1d, 0xfe, 0x54, 0x25, 0x8f, 0x6c, 0xc6, 0x8e, 0x24, 0xc7, 0x6d, 0x1c, 0xb6, 0x55, 0xff, 0xc5, 0x6f, 0x8c, 0x26, 0x57, 0xfd, 0x1e, 0xb4, 0xfc, 0x56, 0xb5, 0x1f, 0x6e, 0xc4, 0x27, 0x8d, 0x53, 0xf9, 0x1a, 0xb0, 0xc1, 0x6b, 0x88, 0x22, 0x6a, 0xc0, 0x23, 0x89, 0xf8, 0x52, 0xb1, 0x1b, 0x21, 0x8b, 0x68, 0xc2, 0xb3, 0x19, 0xfa, 0x50, 0x18, 0xb2, 0x51, 0xfb, 0x8a, 0x20, 0xc3, 0x69, 0x62, 0xc8, 0x2b, 0x81, 0xf0, 0x5a, 0xb9, 0x13, 0x5b, 0xf1, 0x12, 0xb8, 0xc9, 0x63, 0x80, 0x2a, 0x10, 0xba, 0x59, 0xf3, 0x82, 0x28, 0xcb, 0x61, 0x29, 0x83, 0x60, 0xca, 0xbb, 0x11, 0xf2, 0x58, 0x86, 0x2c, 0xcf, 0x65, 0x14, 0xbe, 0x5d, 0xf7, 0xbf, 0x15, 0xf6, 0x5c, 0x2d, 0x87, 0x64, 0xce, 0xf4, 0x5e, 0xbd, 0x17, 0x66, 0xcc, 0x2f, 0x85, 0xcd, 0x67, 0x84, 0x2e, 0x5f, 0xf5, 0x16, 0xbc},
+ {0x0, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0xc, 0xec, 0x47, 0x62, 0xc9, 0x29, 0x82, 0xf4, 0x5f, 0xbf, 0x14, 0x53, 0xf8, 0x18, 0xb3, 0xc5, 0x6e, 0x8e, 0x25, 0xc4, 0x6f, 0x8f, 0x24, 0x52, 0xf9, 0x19, 0xb2, 0xf5, 0x5e, 0xbe, 0x15, 0x63, 0xc8, 0x28, 0x83, 0xa6, 0xd, 0xed, 0x46, 0x30, 0x9b, 0x7b, 0xd0, 0x97, 0x3c, 0xdc, 0x77, 0x1, 0xaa, 0x4a, 0xe1, 0x95, 0x3e, 0xde, 0x75, 0x3, 0xa8, 0x48, 0xe3, 0xa4, 0xf, 0xef, 0x44, 0x32, 0x99, 0x79, 0xd2, 0xf7, 0x5c, 0xbc, 0x17, 0x61, 0xca, 0x2a, 0x81, 0xc6, 0x6d, 0x8d, 0x26, 0x50, 0xfb, 0x1b, 0xb0, 0x51, 0xfa, 0x1a, 0xb1, 0xc7, 0x6c, 0x8c, 0x27, 0x60, 0xcb, 0x2b, 0x80, 0xf6, 0x5d, 0xbd, 0x16, 0x33, 0x98, 0x78, 0xd3, 0xa5, 0xe, 0xee, 0x45, 0x2, 0xa9, 0x49, 0xe2, 0x94, 0x3f, 0xdf, 0x74, 0x37, 0x9c, 0x7c, 0xd7, 0xa1, 0xa, 0xea, 0x41, 0x6, 0xad, 0x4d, 0xe6, 0x90, 0x3b, 0xdb, 0x70, 0x55, 0xfe, 0x1e, 0xb5, 0xc3, 0x68, 0x88, 0x23, 0x64, 0xcf, 0x2f, 0x84, 0xf2, 0x59, 0xb9, 0x12, 0xf3, 0x58, 0xb8, 0x13, 0x65, 0xce, 0x2e, 0x85, 0xc2, 0x69, 0x89, 0x22, 0x54, 0xff, 0x1f, 0xb4, 0x91, 0x3a, 0xda, 0x71, 0x7, 0xac, 0x4c, 0xe7, 0xa0, 0xb, 0xeb, 0x40, 0x36, 0x9d, 0x7d, 0xd6, 0xa2, 0x9, 0xe9, 0x42, 0x34, 0x9f, 0x7f, 0xd4, 0x93, 0x38, 0xd8, 0x73, 0x5, 0xae, 0x4e, 0xe5, 0xc0, 0x6b, 0x8b, 0x20, 0x56, 0xfd, 0x1d, 0xb6, 0xf1, 0x5a, 0xba, 0x11, 0x67, 0xcc, 0x2c, 0x87, 0x66, 0xcd, 0x2d, 0x86, 0xf0, 0x5b, 0xbb, 0x10, 0x57, 0xfc, 0x1c, 0xb7, 0xc1, 0x6a, 0x8a, 0x21, 0x4, 0xaf, 0x4f, 0xe4, 0x92, 0x39, 0xd9, 0x72, 0x35, 0x9e, 0x7e, 0xd5, 0xa3, 0x8, 0xe8, 0x43},
+ {0x0, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x9, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a, 0x12, 0xbe, 0x57, 0xfb, 0x98, 0x34, 0xdd, 0x71, 0x1b, 0xb7, 0x5e, 0xf2, 0x91, 0x3d, 0xd4, 0x78, 0x24, 0x88, 0x61, 0xcd, 0xae, 0x2, 0xeb, 0x47, 0x2d, 0x81, 0x68, 0xc4, 0xa7, 0xb, 0xe2, 0x4e, 0x36, 0x9a, 0x73, 0xdf, 0xbc, 0x10, 0xf9, 0x55, 0x3f, 0x93, 0x7a, 0xd6, 0xb5, 0x19, 0xf0, 0x5c, 0x48, 0xe4, 0xd, 0xa1, 0xc2, 0x6e, 0x87, 0x2b, 0x41, 0xed, 0x4, 0xa8, 0xcb, 0x67, 0x8e, 0x22, 0x5a, 0xf6, 0x1f, 0xb3, 0xd0, 0x7c, 0x95, 0x39, 0x53, 0xff, 0x16, 0xba, 0xd9, 0x75, 0x9c, 0x30, 0x6c, 0xc0, 0x29, 0x85, 0xe6, 0x4a, 0xa3, 0xf, 0x65, 0xc9, 0x20, 0x8c, 0xef, 0x43, 0xaa, 0x6, 0x7e, 0xd2, 0x3b, 0x97, 0xf4, 0x58, 0xb1, 0x1d, 0x77, 0xdb, 0x32, 0x9e, 0xfd, 0x51, 0xb8, 0x14, 0x90, 0x3c, 0xd5, 0x79, 0x1a, 0xb6, 0x5f, 0xf3, 0x99, 0x35, 0xdc, 0x70, 0x13, 0xbf, 0x56, 0xfa, 0x82, 0x2e, 0xc7, 0x6b, 0x8, 0xa4, 0x4d, 0xe1, 0x8b, 0x27, 0xce, 0x62, 0x1, 0xad, 0x44, 0xe8, 0xb4, 0x18, 0xf1, 0x5d, 0x3e, 0x92, 0x7b, 0xd7, 0xbd, 0x11, 0xf8, 0x54, 0x37, 0x9b, 0x72, 0xde, 0xa6, 0xa, 0xe3, 0x4f, 0x2c, 0x80, 0x69, 0xc5, 0xaf, 0x3, 0xea, 0x46, 0x25, 0x89, 0x60, 0xcc, 0xd8, 0x74, 0x9d, 0x31, 0x52, 0xfe, 0x17, 0xbb, 0xd1, 0x7d, 0x94, 0x38, 0x5b, 0xf7, 0x1e, 0xb2, 0xca, 0x66, 0x8f, 0x23, 0x40, 0xec, 0x5, 0xa9, 0xc3, 0x6f, 0x86, 0x2a, 0x49, 0xe5, 0xc, 0xa0, 0xfc, 0x50, 0xb9, 0x15, 0x76, 0xda, 0x33, 0x9f, 0xf5, 0x59, 0xb0, 0x1c, 0x7f, 0xd3, 0x3a, 0x96, 0xee, 0x42, 0xab, 0x7, 0x64, 0xc8, 0x21, 0x8d, 0xe7, 0x4b, 0xa2, 0xe, 0x6d, 0xc1, 0x28, 0x84},
+ {0x0, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x1, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65, 0x2, 0xaf, 0x45, 0xe8, 0x8c, 0x21, 0xcb, 0x66, 0x3, 0xae, 0x44, 0xe9, 0x8d, 0x20, 0xca, 0x67, 0x4, 0xa9, 0x43, 0xee, 0x8a, 0x27, 0xcd, 0x60, 0x5, 0xa8, 0x42, 0xef, 0x8b, 0x26, 0xcc, 0x61, 0x6, 0xab, 0x41, 0xec, 0x88, 0x25, 0xcf, 0x62, 0x7, 0xaa, 0x40, 0xed, 0x89, 0x24, 0xce, 0x63, 0x8, 0xa5, 0x4f, 0xe2, 0x86, 0x2b, 0xc1, 0x6c, 0x9, 0xa4, 0x4e, 0xe3, 0x87, 0x2a, 0xc0, 0x6d, 0xa, 0xa7, 0x4d, 0xe0, 0x84, 0x29, 0xc3, 0x6e, 0xb, 0xa6, 0x4c, 0xe1, 0x85, 0x28, 0xc2, 0x6f, 0xc, 0xa1, 0x4b, 0xe6, 0x82, 0x2f, 0xc5, 0x68, 0xd, 0xa0, 0x4a, 0xe7, 0x83, 0x2e, 0xc4, 0x69, 0xe, 0xa3, 0x49, 0xe4, 0x80, 0x2d, 0xc7, 0x6a, 0xf, 0xa2, 0x48, 0xe5, 0x81, 0x2c, 0xc6, 0x6b, 0x10, 0xbd, 0x57, 0xfa, 0x9e, 0x33, 0xd9, 0x74, 0x11, 0xbc, 0x56, 0xfb, 0x9f, 0x32, 0xd8, 0x75, 0x12, 0xbf, 0x55, 0xf8, 0x9c, 0x31, 0xdb, 0x76, 0x13, 0xbe, 0x54, 0xf9, 0x9d, 0x30, 0xda, 0x77, 0x14, 0xb9, 0x53, 0xfe, 0x9a, 0x37, 0xdd, 0x70, 0x15, 0xb8, 0x52, 0xff, 0x9b, 0x36, 0xdc, 0x71, 0x16, 0xbb, 0x51, 0xfc, 0x98, 0x35, 0xdf, 0x72, 0x17, 0xba, 0x50, 0xfd, 0x99, 0x34, 0xde, 0x73, 0x18, 0xb5, 0x5f, 0xf2, 0x96, 0x3b, 0xd1, 0x7c, 0x19, 0xb4, 0x5e, 0xf3, 0x97, 0x3a, 0xd0, 0x7d, 0x1a, 0xb7, 0x5d, 0xf0, 0x94, 0x39, 0xd3, 0x7e, 0x1b, 0xb6, 0x5c, 0xf1, 0x95, 0x38, 0xd2, 0x7f, 0x1c, 0xb1, 0x5b, 0xf6, 0x92, 0x3f, 0xd5, 0x78, 0x1d, 0xb0, 0x5a, 0xf7, 0x93, 0x3e, 0xd4, 0x79, 0x1e, 0xb3, 0x59, 0xf4, 0x90, 0x3d, 0xd7, 0x7a, 0x1f, 0xb2, 0x58, 0xf5, 0x91, 0x3c, 0xd6, 0x7b},
+ {0x0, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74, 0x32, 0x9c, 0x73, 0xdd, 0xb0, 0x1e, 0xf1, 0x5f, 0x2b, 0x85, 0x6a, 0xc4, 0xa9, 0x7, 0xe8, 0x46, 0x64, 0xca, 0x25, 0x8b, 0xe6, 0x48, 0xa7, 0x9, 0x7d, 0xd3, 0x3c, 0x92, 0xff, 0x51, 0xbe, 0x10, 0x56, 0xf8, 0x17, 0xb9, 0xd4, 0x7a, 0x95, 0x3b, 0x4f, 0xe1, 0xe, 0xa0, 0xcd, 0x63, 0x8c, 0x22, 0xc8, 0x66, 0x89, 0x27, 0x4a, 0xe4, 0xb, 0xa5, 0xd1, 0x7f, 0x90, 0x3e, 0x53, 0xfd, 0x12, 0xbc, 0xfa, 0x54, 0xbb, 0x15, 0x78, 0xd6, 0x39, 0x97, 0xe3, 0x4d, 0xa2, 0xc, 0x61, 0xcf, 0x20, 0x8e, 0xac, 0x2, 0xed, 0x43, 0x2e, 0x80, 0x6f, 0xc1, 0xb5, 0x1b, 0xf4, 0x5a, 0x37, 0x99, 0x76, 0xd8, 0x9e, 0x30, 0xdf, 0x71, 0x1c, 0xb2, 0x5d, 0xf3, 0x87, 0x29, 0xc6, 0x68, 0x5, 0xab, 0x44, 0xea, 0x8d, 0x23, 0xcc, 0x62, 0xf, 0xa1, 0x4e, 0xe0, 0x94, 0x3a, 0xd5, 0x7b, 0x16, 0xb8, 0x57, 0xf9, 0xbf, 0x11, 0xfe, 0x50, 0x3d, 0x93, 0x7c, 0xd2, 0xa6, 0x8, 0xe7, 0x49, 0x24, 0x8a, 0x65, 0xcb, 0xe9, 0x47, 0xa8, 0x6, 0x6b, 0xc5, 0x2a, 0x84, 0xf0, 0x5e, 0xb1, 0x1f, 0x72, 0xdc, 0x33, 0x9d, 0xdb, 0x75, 0x9a, 0x34, 0x59, 0xf7, 0x18, 0xb6, 0xc2, 0x6c, 0x83, 0x2d, 0x40, 0xee, 0x1, 0xaf, 0x45, 0xeb, 0x4, 0xaa, 0xc7, 0x69, 0x86, 0x28, 0x5c, 0xf2, 0x1d, 0xb3, 0xde, 0x70, 0x9f, 0x31, 0x77, 0xd9, 0x36, 0x98, 0xf5, 0x5b, 0xb4, 0x1a, 0x6e, 0xc0, 0x2f, 0x81, 0xec, 0x42, 0xad, 0x3, 0x21, 0x8f, 0x60, 0xce, 0xa3, 0xd, 0xe2, 0x4c, 0x38, 0x96, 0x79, 0xd7, 0xba, 0x14, 0xfb, 0x55, 0x13, 0xbd, 0x52, 0xfc, 0x91, 0x3f, 0xd0, 0x7e, 0xa, 0xa4, 0x4b, 0xe5, 0x88, 0x26, 0xc9, 0x67},
+ {0x0, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b, 0x22, 0x8d, 0x61, 0xce, 0xa4, 0xb, 0xe7, 0x48, 0x33, 0x9c, 0x70, 0xdf, 0xb5, 0x1a, 0xf6, 0x59, 0x44, 0xeb, 0x7, 0xa8, 0xc2, 0x6d, 0x81, 0x2e, 0x55, 0xfa, 0x16, 0xb9, 0xd3, 0x7c, 0x90, 0x3f, 0x66, 0xc9, 0x25, 0x8a, 0xe0, 0x4f, 0xa3, 0xc, 0x77, 0xd8, 0x34, 0x9b, 0xf1, 0x5e, 0xb2, 0x1d, 0x88, 0x27, 0xcb, 0x64, 0xe, 0xa1, 0x4d, 0xe2, 0x99, 0x36, 0xda, 0x75, 0x1f, 0xb0, 0x5c, 0xf3, 0xaa, 0x5, 0xe9, 0x46, 0x2c, 0x83, 0x6f, 0xc0, 0xbb, 0x14, 0xf8, 0x57, 0x3d, 0x92, 0x7e, 0xd1, 0xcc, 0x63, 0x8f, 0x20, 0x4a, 0xe5, 0x9, 0xa6, 0xdd, 0x72, 0x9e, 0x31, 0x5b, 0xf4, 0x18, 0xb7, 0xee, 0x41, 0xad, 0x2, 0x68, 0xc7, 0x2b, 0x84, 0xff, 0x50, 0xbc, 0x13, 0x79, 0xd6, 0x3a, 0x95, 0xd, 0xa2, 0x4e, 0xe1, 0x8b, 0x24, 0xc8, 0x67, 0x1c, 0xb3, 0x5f, 0xf0, 0x9a, 0x35, 0xd9, 0x76, 0x2f, 0x80, 0x6c, 0xc3, 0xa9, 0x6, 0xea, 0x45, 0x3e, 0x91, 0x7d, 0xd2, 0xb8, 0x17, 0xfb, 0x54, 0x49, 0xe6, 0xa, 0xa5, 0xcf, 0x60, 0x8c, 0x23, 0x58, 0xf7, 0x1b, 0xb4, 0xde, 0x71, 0x9d, 0x32, 0x6b, 0xc4, 0x28, 0x87, 0xed, 0x42, 0xae, 0x1, 0x7a, 0xd5, 0x39, 0x96, 0xfc, 0x53, 0xbf, 0x10, 0x85, 0x2a, 0xc6, 0x69, 0x3, 0xac, 0x40, 0xef, 0x94, 0x3b, 0xd7, 0x78, 0x12, 0xbd, 0x51, 0xfe, 0xa7, 0x8, 0xe4, 0x4b, 0x21, 0x8e, 0x62, 0xcd, 0xb6, 0x19, 0xf5, 0x5a, 0x30, 0x9f, 0x73, 0xdc, 0xc1, 0x6e, 0x82, 0x2d, 0x47, 0xe8, 0x4, 0xab, 0xd0, 0x7f, 0x93, 0x3c, 0x56, 0xf9, 0x15, 0xba, 0xe3, 0x4c, 0xa0, 0xf, 0x65, 0xca, 0x26, 0x89, 0xf2, 0x5d, 0xb1, 0x1e, 0x74, 0xdb, 0x37, 0x98},
+ {0x0, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde, 0xcf, 0x7f, 0xb2, 0x2, 0x35, 0x85, 0x48, 0xf8, 0x26, 0x96, 0x5b, 0xeb, 0xdc, 0x6c, 0xa1, 0x11, 0x83, 0x33, 0xfe, 0x4e, 0x79, 0xc9, 0x4, 0xb4, 0x6a, 0xda, 0x17, 0xa7, 0x90, 0x20, 0xed, 0x5d, 0x4c, 0xfc, 0x31, 0x81, 0xb6, 0x6, 0xcb, 0x7b, 0xa5, 0x15, 0xd8, 0x68, 0x5f, 0xef, 0x22, 0x92, 0x1b, 0xab, 0x66, 0xd6, 0xe1, 0x51, 0x9c, 0x2c, 0xf2, 0x42, 0x8f, 0x3f, 0x8, 0xb8, 0x75, 0xc5, 0xd4, 0x64, 0xa9, 0x19, 0x2e, 0x9e, 0x53, 0xe3, 0x3d, 0x8d, 0x40, 0xf0, 0xc7, 0x77, 0xba, 0xa, 0x98, 0x28, 0xe5, 0x55, 0x62, 0xd2, 0x1f, 0xaf, 0x71, 0xc1, 0xc, 0xbc, 0x8b, 0x3b, 0xf6, 0x46, 0x57, 0xe7, 0x2a, 0x9a, 0xad, 0x1d, 0xd0, 0x60, 0xbe, 0xe, 0xc3, 0x73, 0x44, 0xf4, 0x39, 0x89, 0x36, 0x86, 0x4b, 0xfb, 0xcc, 0x7c, 0xb1, 0x1, 0xdf, 0x6f, 0xa2, 0x12, 0x25, 0x95, 0x58, 0xe8, 0xf9, 0x49, 0x84, 0x34, 0x3, 0xb3, 0x7e, 0xce, 0x10, 0xa0, 0x6d, 0xdd, 0xea, 0x5a, 0x97, 0x27, 0xb5, 0x5, 0xc8, 0x78, 0x4f, 0xff, 0x32, 0x82, 0x5c, 0xec, 0x21, 0x91, 0xa6, 0x16, 0xdb, 0x6b, 0x7a, 0xca, 0x7, 0xb7, 0x80, 0x30, 0xfd, 0x4d, 0x93, 0x23, 0xee, 0x5e, 0x69, 0xd9, 0x14, 0xa4, 0x2d, 0x9d, 0x50, 0xe0, 0xd7, 0x67, 0xaa, 0x1a, 0xc4, 0x74, 0xb9, 0x9, 0x3e, 0x8e, 0x43, 0xf3, 0xe2, 0x52, 0x9f, 0x2f, 0x18, 0xa8, 0x65, 0xd5, 0xb, 0xbb, 0x76, 0xc6, 0xf1, 0x41, 0x8c, 0x3c, 0xae, 0x1e, 0xd3, 0x63, 0x54, 0xe4, 0x29, 0x99, 0x47, 0xf7, 0x3a, 0x8a, 0xbd, 0xd, 0xc0, 0x70, 0x61, 0xd1, 0x1c, 0xac, 0x9b, 0x2b, 0xe6, 0x56, 0x88, 0x38, 0xf5, 0x45, 0x72, 0xc2, 0xf, 0xbf},
+ {0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1, 0xdf, 0x6e, 0xa0, 0x11, 0x21, 0x90, 0x5e, 0xef, 0x3e, 0x8f, 0x41, 0xf0, 0xc0, 0x71, 0xbf, 0xe, 0xa3, 0x12, 0xdc, 0x6d, 0x5d, 0xec, 0x22, 0x93, 0x42, 0xf3, 0x3d, 0x8c, 0xbc, 0xd, 0xc3, 0x72, 0x7c, 0xcd, 0x3, 0xb2, 0x82, 0x33, 0xfd, 0x4c, 0x9d, 0x2c, 0xe2, 0x53, 0x63, 0xd2, 0x1c, 0xad, 0x5b, 0xea, 0x24, 0x95, 0xa5, 0x14, 0xda, 0x6b, 0xba, 0xb, 0xc5, 0x74, 0x44, 0xf5, 0x3b, 0x8a, 0x84, 0x35, 0xfb, 0x4a, 0x7a, 0xcb, 0x5, 0xb4, 0x65, 0xd4, 0x1a, 0xab, 0x9b, 0x2a, 0xe4, 0x55, 0xf8, 0x49, 0x87, 0x36, 0x6, 0xb7, 0x79, 0xc8, 0x19, 0xa8, 0x66, 0xd7, 0xe7, 0x56, 0x98, 0x29, 0x27, 0x96, 0x58, 0xe9, 0xd9, 0x68, 0xa6, 0x17, 0xc6, 0x77, 0xb9, 0x8, 0x38, 0x89, 0x47, 0xf6, 0xb6, 0x7, 0xc9, 0x78, 0x48, 0xf9, 0x37, 0x86, 0x57, 0xe6, 0x28, 0x99, 0xa9, 0x18, 0xd6, 0x67, 0x69, 0xd8, 0x16, 0xa7, 0x97, 0x26, 0xe8, 0x59, 0x88, 0x39, 0xf7, 0x46, 0x76, 0xc7, 0x9, 0xb8, 0x15, 0xa4, 0x6a, 0xdb, 0xeb, 0x5a, 0x94, 0x25, 0xf4, 0x45, 0x8b, 0x3a, 0xa, 0xbb, 0x75, 0xc4, 0xca, 0x7b, 0xb5, 0x4, 0x34, 0x85, 0x4b, 0xfa, 0x2b, 0x9a, 0x54, 0xe5, 0xd5, 0x64, 0xaa, 0x1b, 0xed, 0x5c, 0x92, 0x23, 0x13, 0xa2, 0x6c, 0xdd, 0xc, 0xbd, 0x73, 0xc2, 0xf2, 0x43, 0x8d, 0x3c, 0x32, 0x83, 0x4d, 0xfc, 0xcc, 0x7d, 0xb3, 0x2, 0xd3, 0x62, 0xac, 0x1d, 0x2d, 0x9c, 0x52, 0xe3, 0x4e, 0xff, 0x31, 0x80, 0xb0, 0x1, 0xcf, 0x7e, 0xaf, 0x1e, 0xd0, 0x61, 0x51, 0xe0, 0x2e, 0x9f, 0x91, 0x20, 0xee, 0x5f, 0x6f, 0xde, 0x10, 0xa1, 0x70, 0xc1, 0xf, 0xbe, 0x8e, 0x3f, 0xf1, 0x40},
+ {0x0, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0xb, 0xb9, 0x72, 0xc0, 0xef, 0x5d, 0x96, 0x24, 0x1d, 0xaf, 0x64, 0xd6, 0x16, 0xa4, 0x6f, 0xdd, 0xe4, 0x56, 0x9d, 0x2f, 0xc3, 0x71, 0xba, 0x8, 0x31, 0x83, 0x48, 0xfa, 0x3a, 0x88, 0x43, 0xf1, 0xc8, 0x7a, 0xb1, 0x3, 0x2c, 0x9e, 0x55, 0xe7, 0xde, 0x6c, 0xa7, 0x15, 0xd5, 0x67, 0xac, 0x1e, 0x27, 0x95, 0x5e, 0xec, 0x9b, 0x29, 0xe2, 0x50, 0x69, 0xdb, 0x10, 0xa2, 0x62, 0xd0, 0x1b, 0xa9, 0x90, 0x22, 0xe9, 0x5b, 0x74, 0xc6, 0xd, 0xbf, 0x86, 0x34, 0xff, 0x4d, 0x8d, 0x3f, 0xf4, 0x46, 0x7f, 0xcd, 0x6, 0xb4, 0x58, 0xea, 0x21, 0x93, 0xaa, 0x18, 0xd3, 0x61, 0xa1, 0x13, 0xd8, 0x6a, 0x53, 0xe1, 0x2a, 0x98, 0xb7, 0x5, 0xce, 0x7c, 0x45, 0xf7, 0x3c, 0x8e, 0x4e, 0xfc, 0x37, 0x85, 0xbc, 0xe, 0xc5, 0x77, 0x2b, 0x99, 0x52, 0xe0, 0xd9, 0x6b, 0xa0, 0x12, 0xd2, 0x60, 0xab, 0x19, 0x20, 0x92, 0x59, 0xeb, 0xc4, 0x76, 0xbd, 0xf, 0x36, 0x84, 0x4f, 0xfd, 0x3d, 0x8f, 0x44, 0xf6, 0xcf, 0x7d, 0xb6, 0x4, 0xe8, 0x5a, 0x91, 0x23, 0x1a, 0xa8, 0x63, 0xd1, 0x11, 0xa3, 0x68, 0xda, 0xe3, 0x51, 0x9a, 0x28, 0x7, 0xb5, 0x7e, 0xcc, 0xf5, 0x47, 0x8c, 0x3e, 0xfe, 0x4c, 0x87, 0x35, 0xc, 0xbe, 0x75, 0xc7, 0xb0, 0x2, 0xc9, 0x7b, 0x42, 0xf0, 0x3b, 0x89, 0x49, 0xfb, 0x30, 0x82, 0xbb, 0x9, 0xc2, 0x70, 0x5f, 0xed, 0x26, 0x94, 0xad, 0x1f, 0xd4, 0x66, 0xa6, 0x14, 0xdf, 0x6d, 0x54, 0xe6, 0x2d, 0x9f, 0x73, 0xc1, 0xa, 0xb8, 0x81, 0x33, 0xf8, 0x4a, 0x8a, 0x38, 0xf3, 0x41, 0x78, 0xca, 0x1, 0xb3, 0x9c, 0x2e, 0xe5, 0x57, 0x6e, 0xdc, 0x17, 0xa5, 0x65, 0xd7, 0x1c, 0xae, 0x97, 0x25, 0xee, 0x5c},
+ {0x0, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x7, 0xb4, 0x7c, 0xcf, 0xff, 0x4c, 0x84, 0x37, 0x9, 0xba, 0x72, 0xc1, 0xe, 0xbd, 0x75, 0xc6, 0xf8, 0x4b, 0x83, 0x30, 0xe3, 0x50, 0x98, 0x2b, 0x15, 0xa6, 0x6e, 0xdd, 0x12, 0xa1, 0x69, 0xda, 0xe4, 0x57, 0x9f, 0x2c, 0x1c, 0xaf, 0x67, 0xd4, 0xea, 0x59, 0x91, 0x22, 0xed, 0x5e, 0x96, 0x25, 0x1b, 0xa8, 0x60, 0xd3, 0xdb, 0x68, 0xa0, 0x13, 0x2d, 0x9e, 0x56, 0xe5, 0x2a, 0x99, 0x51, 0xe2, 0xdc, 0x6f, 0xa7, 0x14, 0x24, 0x97, 0x5f, 0xec, 0xd2, 0x61, 0xa9, 0x1a, 0xd5, 0x66, 0xae, 0x1d, 0x23, 0x90, 0x58, 0xeb, 0x38, 0x8b, 0x43, 0xf0, 0xce, 0x7d, 0xb5, 0x6, 0xc9, 0x7a, 0xb2, 0x1, 0x3f, 0x8c, 0x44, 0xf7, 0xc7, 0x74, 0xbc, 0xf, 0x31, 0x82, 0x4a, 0xf9, 0x36, 0x85, 0x4d, 0xfe, 0xc0, 0x73, 0xbb, 0x8, 0xab, 0x18, 0xd0, 0x63, 0x5d, 0xee, 0x26, 0x95, 0x5a, 0xe9, 0x21, 0x92, 0xac, 0x1f, 0xd7, 0x64, 0x54, 0xe7, 0x2f, 0x9c, 0xa2, 0x11, 0xd9, 0x6a, 0xa5, 0x16, 0xde, 0x6d, 0x53, 0xe0, 0x28, 0x9b, 0x48, 0xfb, 0x33, 0x80, 0xbe, 0xd, 0xc5, 0x76, 0xb9, 0xa, 0xc2, 0x71, 0x4f, 0xfc, 0x34, 0x87, 0xb7, 0x4, 0xcc, 0x7f, 0x41, 0xf2, 0x3a, 0x89, 0x46, 0xf5, 0x3d, 0x8e, 0xb0, 0x3, 0xcb, 0x78, 0x70, 0xc3, 0xb, 0xb8, 0x86, 0x35, 0xfd, 0x4e, 0x81, 0x32, 0xfa, 0x49, 0x77, 0xc4, 0xc, 0xbf, 0x8f, 0x3c, 0xf4, 0x47, 0x79, 0xca, 0x2, 0xb1, 0x7e, 0xcd, 0x5, 0xb6, 0x88, 0x3b, 0xf3, 0x40, 0x93, 0x20, 0xe8, 0x5b, 0x65, 0xd6, 0x1e, 0xad, 0x62, 0xd1, 0x19, 0xaa, 0x94, 0x27, 0xef, 0x5c, 0x6c, 0xdf, 0x17, 0xa4, 0x9a, 0x29, 0xe1, 0x52, 0x9d, 0x2e, 0xe6, 0x55, 0x6b, 0xd8, 0x10, 0xa3},
+ {0x0, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x8, 0x23, 0x97, 0x56, 0xe2, 0x8f, 0x3b, 0xfa, 0x4e, 0x65, 0xd1, 0x10, 0xa4, 0x46, 0xf2, 0x33, 0x87, 0xac, 0x18, 0xd9, 0x6d, 0x3, 0xb7, 0x76, 0xc2, 0xe9, 0x5d, 0x9c, 0x28, 0xca, 0x7e, 0xbf, 0xb, 0x20, 0x94, 0x55, 0xe1, 0x8c, 0x38, 0xf9, 0x4d, 0x66, 0xd2, 0x13, 0xa7, 0x45, 0xf1, 0x30, 0x84, 0xaf, 0x1b, 0xda, 0x6e, 0x6, 0xb2, 0x73, 0xc7, 0xec, 0x58, 0x99, 0x2d, 0xcf, 0x7b, 0xba, 0xe, 0x25, 0x91, 0x50, 0xe4, 0x89, 0x3d, 0xfc, 0x48, 0x63, 0xd7, 0x16, 0xa2, 0x40, 0xf4, 0x35, 0x81, 0xaa, 0x1e, 0xdf, 0x6b, 0x5, 0xb1, 0x70, 0xc4, 0xef, 0x5b, 0x9a, 0x2e, 0xcc, 0x78, 0xb9, 0xd, 0x26, 0x92, 0x53, 0xe7, 0x8a, 0x3e, 0xff, 0x4b, 0x60, 0xd4, 0x15, 0xa1, 0x43, 0xf7, 0x36, 0x82, 0xa9, 0x1d, 0xdc, 0x68, 0xc, 0xb8, 0x79, 0xcd, 0xe6, 0x52, 0x93, 0x27, 0xc5, 0x71, 0xb0, 0x4, 0x2f, 0x9b, 0x5a, 0xee, 0x83, 0x37, 0xf6, 0x42, 0x69, 0xdd, 0x1c, 0xa8, 0x4a, 0xfe, 0x3f, 0x8b, 0xa0, 0x14, 0xd5, 0x61, 0xf, 0xbb, 0x7a, 0xce, 0xe5, 0x51, 0x90, 0x24, 0xc6, 0x72, 0xb3, 0x7, 0x2c, 0x98, 0x59, 0xed, 0x80, 0x34, 0xf5, 0x41, 0x6a, 0xde, 0x1f, 0xab, 0x49, 0xfd, 0x3c, 0x88, 0xa3, 0x17, 0xd6, 0x62, 0xa, 0xbe, 0x7f, 0xcb, 0xe0, 0x54, 0x95, 0x21, 0xc3, 0x77, 0xb6, 0x2, 0x29, 0x9d, 0x5c, 0xe8, 0x85, 0x31, 0xf0, 0x44, 0x6f, 0xdb, 0x1a, 0xae, 0x4c, 0xf8, 0x39, 0x8d, 0xa6, 0x12, 0xd3, 0x67, 0x9, 0xbd, 0x7c, 0xc8, 0xe3, 0x57, 0x96, 0x22, 0xc0, 0x74, 0xb5, 0x1, 0x2a, 0x9e, 0x5f, 0xeb, 0x86, 0x32, 0xf3, 0x47, 0x6c, 0xd8, 0x19, 0xad, 0x4f, 0xfb, 0x3a, 0x8e, 0xa5, 0x11, 0xd0, 0x64},
+ {0x0, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x3, 0x2f, 0x9a, 0x58, 0xed, 0x9f, 0x2a, 0xe8, 0x5d, 0x71, 0xc4, 0x6, 0xb3, 0x5e, 0xeb, 0x29, 0x9c, 0xb0, 0x5, 0xc7, 0x72, 0x23, 0x96, 0x54, 0xe1, 0xcd, 0x78, 0xba, 0xf, 0xe2, 0x57, 0x95, 0x20, 0xc, 0xb9, 0x7b, 0xce, 0xbc, 0x9, 0xcb, 0x7e, 0x52, 0xe7, 0x25, 0x90, 0x7d, 0xc8, 0xa, 0xbf, 0x93, 0x26, 0xe4, 0x51, 0x46, 0xf3, 0x31, 0x84, 0xa8, 0x1d, 0xdf, 0x6a, 0x87, 0x32, 0xf0, 0x45, 0x69, 0xdc, 0x1e, 0xab, 0xd9, 0x6c, 0xae, 0x1b, 0x37, 0x82, 0x40, 0xf5, 0x18, 0xad, 0x6f, 0xda, 0xf6, 0x43, 0x81, 0x34, 0x65, 0xd0, 0x12, 0xa7, 0x8b, 0x3e, 0xfc, 0x49, 0xa4, 0x11, 0xd3, 0x66, 0x4a, 0xff, 0x3d, 0x88, 0xfa, 0x4f, 0x8d, 0x38, 0x14, 0xa1, 0x63, 0xd6, 0x3b, 0x8e, 0x4c, 0xf9, 0xd5, 0x60, 0xa2, 0x17, 0x8c, 0x39, 0xfb, 0x4e, 0x62, 0xd7, 0x15, 0xa0, 0x4d, 0xf8, 0x3a, 0x8f, 0xa3, 0x16, 0xd4, 0x61, 0x13, 0xa6, 0x64, 0xd1, 0xfd, 0x48, 0x8a, 0x3f, 0xd2, 0x67, 0xa5, 0x10, 0x3c, 0x89, 0x4b, 0xfe, 0xaf, 0x1a, 0xd8, 0x6d, 0x41, 0xf4, 0x36, 0x83, 0x6e, 0xdb, 0x19, 0xac, 0x80, 0x35, 0xf7, 0x42, 0x30, 0x85, 0x47, 0xf2, 0xde, 0x6b, 0xa9, 0x1c, 0xf1, 0x44, 0x86, 0x33, 0x1f, 0xaa, 0x68, 0xdd, 0xca, 0x7f, 0xbd, 0x8, 0x24, 0x91, 0x53, 0xe6, 0xb, 0xbe, 0x7c, 0xc9, 0xe5, 0x50, 0x92, 0x27, 0x55, 0xe0, 0x22, 0x97, 0xbb, 0xe, 0xcc, 0x79, 0x94, 0x21, 0xe3, 0x56, 0x7a, 0xcf, 0xd, 0xb8, 0xe9, 0x5c, 0x9e, 0x2b, 0x7, 0xb2, 0x70, 0xc5, 0x28, 0x9d, 0x5f, 0xea, 0xc6, 0x73, 0xb1, 0x4, 0x76, 0xc3, 0x1, 0xb4, 0x98, 0x2d, 0xef, 0x5a, 0xb7, 0x2, 0xc0, 0x75, 0x59, 0xec, 0x2e, 0x9b},
+ {0x0, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc, 0xaf, 0x19, 0xde, 0x68, 0x4d, 0xfb, 0x3c, 0x8a, 0x76, 0xc0, 0x7, 0xb1, 0x94, 0x22, 0xe5, 0x53, 0x43, 0xf5, 0x32, 0x84, 0xa1, 0x17, 0xd0, 0x66, 0x9a, 0x2c, 0xeb, 0x5d, 0x78, 0xce, 0x9, 0xbf, 0xec, 0x5a, 0x9d, 0x2b, 0xe, 0xb8, 0x7f, 0xc9, 0x35, 0x83, 0x44, 0xf2, 0xd7, 0x61, 0xa6, 0x10, 0x86, 0x30, 0xf7, 0x41, 0x64, 0xd2, 0x15, 0xa3, 0x5f, 0xe9, 0x2e, 0x98, 0xbd, 0xb, 0xcc, 0x7a, 0x29, 0x9f, 0x58, 0xee, 0xcb, 0x7d, 0xba, 0xc, 0xf0, 0x46, 0x81, 0x37, 0x12, 0xa4, 0x63, 0xd5, 0xc5, 0x73, 0xb4, 0x2, 0x27, 0x91, 0x56, 0xe0, 0x1c, 0xaa, 0x6d, 0xdb, 0xfe, 0x48, 0x8f, 0x39, 0x6a, 0xdc, 0x1b, 0xad, 0x88, 0x3e, 0xf9, 0x4f, 0xb3, 0x5, 0xc2, 0x74, 0x51, 0xe7, 0x20, 0x96, 0x11, 0xa7, 0x60, 0xd6, 0xf3, 0x45, 0x82, 0x34, 0xc8, 0x7e, 0xb9, 0xf, 0x2a, 0x9c, 0x5b, 0xed, 0xbe, 0x8, 0xcf, 0x79, 0x5c, 0xea, 0x2d, 0x9b, 0x67, 0xd1, 0x16, 0xa0, 0x85, 0x33, 0xf4, 0x42, 0x52, 0xe4, 0x23, 0x95, 0xb0, 0x6, 0xc1, 0x77, 0x8b, 0x3d, 0xfa, 0x4c, 0x69, 0xdf, 0x18, 0xae, 0xfd, 0x4b, 0x8c, 0x3a, 0x1f, 0xa9, 0x6e, 0xd8, 0x24, 0x92, 0x55, 0xe3, 0xc6, 0x70, 0xb7, 0x1, 0x97, 0x21, 0xe6, 0x50, 0x75, 0xc3, 0x4, 0xb2, 0x4e, 0xf8, 0x3f, 0x89, 0xac, 0x1a, 0xdd, 0x6b, 0x38, 0x8e, 0x49, 0xff, 0xda, 0x6c, 0xab, 0x1d, 0xe1, 0x57, 0x90, 0x26, 0x3, 0xb5, 0x72, 0xc4, 0xd4, 0x62, 0xa5, 0x13, 0x36, 0x80, 0x47, 0xf1, 0xd, 0xbb, 0x7c, 0xca, 0xef, 0x59, 0x9e, 0x28, 0x7b, 0xcd, 0xa, 0xbc, 0x99, 0x2f, 0xe8, 0x5e, 0xa2, 0x14, 0xd3, 0x65, 0x40, 0xf6, 0x31, 0x87},
+ {0x0, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3, 0xbf, 0x8, 0xcc, 0x7b, 0x59, 0xee, 0x2a, 0x9d, 0x6e, 0xd9, 0x1d, 0xaa, 0x88, 0x3f, 0xfb, 0x4c, 0x63, 0xd4, 0x10, 0xa7, 0x85, 0x32, 0xf6, 0x41, 0xb2, 0x5, 0xc1, 0x76, 0x54, 0xe3, 0x27, 0x90, 0xdc, 0x6b, 0xaf, 0x18, 0x3a, 0x8d, 0x49, 0xfe, 0xd, 0xba, 0x7e, 0xc9, 0xeb, 0x5c, 0x98, 0x2f, 0xc6, 0x71, 0xb5, 0x2, 0x20, 0x97, 0x53, 0xe4, 0x17, 0xa0, 0x64, 0xd3, 0xf1, 0x46, 0x82, 0x35, 0x79, 0xce, 0xa, 0xbd, 0x9f, 0x28, 0xec, 0x5b, 0xa8, 0x1f, 0xdb, 0x6c, 0x4e, 0xf9, 0x3d, 0x8a, 0xa5, 0x12, 0xd6, 0x61, 0x43, 0xf4, 0x30, 0x87, 0x74, 0xc3, 0x7, 0xb0, 0x92, 0x25, 0xe1, 0x56, 0x1a, 0xad, 0x69, 0xde, 0xfc, 0x4b, 0x8f, 0x38, 0xcb, 0x7c, 0xb8, 0xf, 0x2d, 0x9a, 0x5e, 0xe9, 0x91, 0x26, 0xe2, 0x55, 0x77, 0xc0, 0x4, 0xb3, 0x40, 0xf7, 0x33, 0x84, 0xa6, 0x11, 0xd5, 0x62, 0x2e, 0x99, 0x5d, 0xea, 0xc8, 0x7f, 0xbb, 0xc, 0xff, 0x48, 0x8c, 0x3b, 0x19, 0xae, 0x6a, 0xdd, 0xf2, 0x45, 0x81, 0x36, 0x14, 0xa3, 0x67, 0xd0, 0x23, 0x94, 0x50, 0xe7, 0xc5, 0x72, 0xb6, 0x1, 0x4d, 0xfa, 0x3e, 0x89, 0xab, 0x1c, 0xd8, 0x6f, 0x9c, 0x2b, 0xef, 0x58, 0x7a, 0xcd, 0x9, 0xbe, 0x57, 0xe0, 0x24, 0x93, 0xb1, 0x6, 0xc2, 0x75, 0x86, 0x31, 0xf5, 0x42, 0x60, 0xd7, 0x13, 0xa4, 0xe8, 0x5f, 0x9b, 0x2c, 0xe, 0xb9, 0x7d, 0xca, 0x39, 0x8e, 0x4a, 0xfd, 0xdf, 0x68, 0xac, 0x1b, 0x34, 0x83, 0x47, 0xf0, 0xd2, 0x65, 0xa1, 0x16, 0xe5, 0x52, 0x96, 0x21, 0x3, 0xb4, 0x70, 0xc7, 0x8b, 0x3c, 0xf8, 0x4f, 0x6d, 0xda, 0x1e, 0xa9, 0x5a, 0xed, 0x29, 0x9e, 0xbc, 0xb, 0xcf, 0x78},
+ {0x0, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0xf, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6, 0x4f, 0xf7, 0x22, 0x9a, 0x95, 0x2d, 0xf8, 0x40, 0xe6, 0x5e, 0x8b, 0x33, 0x3c, 0x84, 0x51, 0xe9, 0x9e, 0x26, 0xf3, 0x4b, 0x44, 0xfc, 0x29, 0x91, 0x37, 0x8f, 0x5a, 0xe2, 0xed, 0x55, 0x80, 0x38, 0xd1, 0x69, 0xbc, 0x4, 0xb, 0xb3, 0x66, 0xde, 0x78, 0xc0, 0x15, 0xad, 0xa2, 0x1a, 0xcf, 0x77, 0x21, 0x99, 0x4c, 0xf4, 0xfb, 0x43, 0x96, 0x2e, 0x88, 0x30, 0xe5, 0x5d, 0x52, 0xea, 0x3f, 0x87, 0x6e, 0xd6, 0x3, 0xbb, 0xb4, 0xc, 0xd9, 0x61, 0xc7, 0x7f, 0xaa, 0x12, 0x1d, 0xa5, 0x70, 0xc8, 0xbf, 0x7, 0xd2, 0x6a, 0x65, 0xdd, 0x8, 0xb0, 0x16, 0xae, 0x7b, 0xc3, 0xcc, 0x74, 0xa1, 0x19, 0xf0, 0x48, 0x9d, 0x25, 0x2a, 0x92, 0x47, 0xff, 0x59, 0xe1, 0x34, 0x8c, 0x83, 0x3b, 0xee, 0x56, 0x42, 0xfa, 0x2f, 0x97, 0x98, 0x20, 0xf5, 0x4d, 0xeb, 0x53, 0x86, 0x3e, 0x31, 0x89, 0x5c, 0xe4, 0xd, 0xb5, 0x60, 0xd8, 0xd7, 0x6f, 0xba, 0x2, 0xa4, 0x1c, 0xc9, 0x71, 0x7e, 0xc6, 0x13, 0xab, 0xdc, 0x64, 0xb1, 0x9, 0x6, 0xbe, 0x6b, 0xd3, 0x75, 0xcd, 0x18, 0xa0, 0xaf, 0x17, 0xc2, 0x7a, 0x93, 0x2b, 0xfe, 0x46, 0x49, 0xf1, 0x24, 0x9c, 0x3a, 0x82, 0x57, 0xef, 0xe0, 0x58, 0x8d, 0x35, 0x63, 0xdb, 0xe, 0xb6, 0xb9, 0x1, 0xd4, 0x6c, 0xca, 0x72, 0xa7, 0x1f, 0x10, 0xa8, 0x7d, 0xc5, 0x2c, 0x94, 0x41, 0xf9, 0xf6, 0x4e, 0x9b, 0x23, 0x85, 0x3d, 0xe8, 0x50, 0x5f, 0xe7, 0x32, 0x8a, 0xfd, 0x45, 0x90, 0x28, 0x27, 0x9f, 0x4a, 0xf2, 0x54, 0xec, 0x39, 0x81, 0x8e, 0x36, 0xe3, 0x5b, 0xb2, 0xa, 0xdf, 0x67, 0x68, 0xd0, 0x5, 0xbd, 0x1b, 0xa3, 0x76, 0xce, 0xc1, 0x79, 0xac, 0x14},
+ {0x0, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x8, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9, 0x5f, 0xe6, 0x30, 0x89, 0x81, 0x38, 0xee, 0x57, 0xfe, 0x47, 0x91, 0x28, 0x20, 0x99, 0x4f, 0xf6, 0xbe, 0x7, 0xd1, 0x68, 0x60, 0xd9, 0xf, 0xb6, 0x1f, 0xa6, 0x70, 0xc9, 0xc1, 0x78, 0xae, 0x17, 0xe1, 0x58, 0x8e, 0x37, 0x3f, 0x86, 0x50, 0xe9, 0x40, 0xf9, 0x2f, 0x96, 0x9e, 0x27, 0xf1, 0x48, 0x61, 0xd8, 0xe, 0xb7, 0xbf, 0x6, 0xd0, 0x69, 0xc0, 0x79, 0xaf, 0x16, 0x1e, 0xa7, 0x71, 0xc8, 0x3e, 0x87, 0x51, 0xe8, 0xe0, 0x59, 0x8f, 0x36, 0x9f, 0x26, 0xf0, 0x49, 0x41, 0xf8, 0x2e, 0x97, 0xdf, 0x66, 0xb0, 0x9, 0x1, 0xb8, 0x6e, 0xd7, 0x7e, 0xc7, 0x11, 0xa8, 0xa0, 0x19, 0xcf, 0x76, 0x80, 0x39, 0xef, 0x56, 0x5e, 0xe7, 0x31, 0x88, 0x21, 0x98, 0x4e, 0xf7, 0xff, 0x46, 0x90, 0x29, 0xc2, 0x7b, 0xad, 0x14, 0x1c, 0xa5, 0x73, 0xca, 0x63, 0xda, 0xc, 0xb5, 0xbd, 0x4, 0xd2, 0x6b, 0x9d, 0x24, 0xf2, 0x4b, 0x43, 0xfa, 0x2c, 0x95, 0x3c, 0x85, 0x53, 0xea, 0xe2, 0x5b, 0x8d, 0x34, 0x7c, 0xc5, 0x13, 0xaa, 0xa2, 0x1b, 0xcd, 0x74, 0xdd, 0x64, 0xb2, 0xb, 0x3, 0xba, 0x6c, 0xd5, 0x23, 0x9a, 0x4c, 0xf5, 0xfd, 0x44, 0x92, 0x2b, 0x82, 0x3b, 0xed, 0x54, 0x5c, 0xe5, 0x33, 0x8a, 0xa3, 0x1a, 0xcc, 0x75, 0x7d, 0xc4, 0x12, 0xab, 0x2, 0xbb, 0x6d, 0xd4, 0xdc, 0x65, 0xb3, 0xa, 0xfc, 0x45, 0x93, 0x2a, 0x22, 0x9b, 0x4d, 0xf4, 0x5d, 0xe4, 0x32, 0x8b, 0x83, 0x3a, 0xec, 0x55, 0x1d, 0xa4, 0x72, 0xcb, 0xc3, 0x7a, 0xac, 0x15, 0xbc, 0x5, 0xd3, 0x6a, 0x62, 0xdb, 0xd, 0xb4, 0x42, 0xfb, 0x2d, 0x94, 0x9c, 0x25, 0xf3, 0x4a, 0xe3, 0x5a, 0x8c, 0x35, 0x3d, 0x84, 0x52, 0xeb},
+ {0x0, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x1, 0xb9, 0x3, 0xd0, 0x6a, 0x6b, 0xd1, 0x2, 0xb8, 0x6f, 0xd5, 0x6, 0xbc, 0xbd, 0x7, 0xd4, 0x6e, 0xd6, 0x6c, 0xbf, 0x5, 0x4, 0xbe, 0x6d, 0xd7, 0xde, 0x64, 0xb7, 0xd, 0xc, 0xb6, 0x65, 0xdf, 0x67, 0xdd, 0xe, 0xb4, 0xb5, 0xf, 0xdc, 0x66, 0xb1, 0xb, 0xd8, 0x62, 0x63, 0xd9, 0xa, 0xb0, 0x8, 0xb2, 0x61, 0xdb, 0xda, 0x60, 0xb3, 0x9, 0xa1, 0x1b, 0xc8, 0x72, 0x73, 0xc9, 0x1a, 0xa0, 0x18, 0xa2, 0x71, 0xcb, 0xca, 0x70, 0xa3, 0x19, 0xce, 0x74, 0xa7, 0x1d, 0x1c, 0xa6, 0x75, 0xcf, 0x77, 0xcd, 0x1e, 0xa4, 0xa5, 0x1f, 0xcc, 0x76, 0x7f, 0xc5, 0x16, 0xac, 0xad, 0x17, 0xc4, 0x7e, 0xc6, 0x7c, 0xaf, 0x15, 0x14, 0xae, 0x7d, 0xc7, 0x10, 0xaa, 0x79, 0xc3, 0xc2, 0x78, 0xab, 0x11, 0xa9, 0x13, 0xc0, 0x7a, 0x7b, 0xc1, 0x12, 0xa8, 0x5f, 0xe5, 0x36, 0x8c, 0x8d, 0x37, 0xe4, 0x5e, 0xe6, 0x5c, 0x8f, 0x35, 0x34, 0x8e, 0x5d, 0xe7, 0x30, 0x8a, 0x59, 0xe3, 0xe2, 0x58, 0x8b, 0x31, 0x89, 0x33, 0xe0, 0x5a, 0x5b, 0xe1, 0x32, 0x88, 0x81, 0x3b, 0xe8, 0x52, 0x53, 0xe9, 0x3a, 0x80, 0x38, 0x82, 0x51, 0xeb, 0xea, 0x50, 0x83, 0x39, 0xee, 0x54, 0x87, 0x3d, 0x3c, 0x86, 0x55, 0xef, 0x57, 0xed, 0x3e, 0x84, 0x85, 0x3f, 0xec, 0x56, 0xfe, 0x44, 0x97, 0x2d, 0x2c, 0x96, 0x45, 0xff, 0x47, 0xfd, 0x2e, 0x94, 0x95, 0x2f, 0xfc, 0x46, 0x91, 0x2b, 0xf8, 0x42, 0x43, 0xf9, 0x2a, 0x90, 0x28, 0x92, 0x41, 0xfb, 0xfa, 0x40, 0x93, 0x29, 0x20, 0x9a, 0x49, 0xf3, 0xf2, 0x48, 0x9b, 0x21, 0x99, 0x23, 0xf0, 0x4a, 0x4b, 0xf1, 0x22, 0x98, 0x4f, 0xf5, 0x26, 0x9c, 0x9d, 0x27, 0xf4, 0x4e, 0xf6, 0x4c, 0x9f, 0x25, 0x24, 0x9e, 0x4d, 0xf7},
+ {0x0, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x6, 0xb1, 0xa, 0xda, 0x61, 0x67, 0xdc, 0xc, 0xb7, 0x7f, 0xc4, 0x14, 0xaf, 0xa9, 0x12, 0xc2, 0x79, 0xce, 0x75, 0xa5, 0x1e, 0x18, 0xa3, 0x73, 0xc8, 0xfe, 0x45, 0x95, 0x2e, 0x28, 0x93, 0x43, 0xf8, 0x4f, 0xf4, 0x24, 0x9f, 0x99, 0x22, 0xf2, 0x49, 0x81, 0x3a, 0xea, 0x51, 0x57, 0xec, 0x3c, 0x87, 0x30, 0x8b, 0x5b, 0xe0, 0xe6, 0x5d, 0x8d, 0x36, 0xe1, 0x5a, 0x8a, 0x31, 0x37, 0x8c, 0x5c, 0xe7, 0x50, 0xeb, 0x3b, 0x80, 0x86, 0x3d, 0xed, 0x56, 0x9e, 0x25, 0xf5, 0x4e, 0x48, 0xf3, 0x23, 0x98, 0x2f, 0x94, 0x44, 0xff, 0xf9, 0x42, 0x92, 0x29, 0x1f, 0xa4, 0x74, 0xcf, 0xc9, 0x72, 0xa2, 0x19, 0xae, 0x15, 0xc5, 0x7e, 0x78, 0xc3, 0x13, 0xa8, 0x60, 0xdb, 0xb, 0xb0, 0xb6, 0xd, 0xdd, 0x66, 0xd1, 0x6a, 0xba, 0x1, 0x7, 0xbc, 0x6c, 0xd7, 0xdf, 0x64, 0xb4, 0xf, 0x9, 0xb2, 0x62, 0xd9, 0x6e, 0xd5, 0x5, 0xbe, 0xb8, 0x3, 0xd3, 0x68, 0xa0, 0x1b, 0xcb, 0x70, 0x76, 0xcd, 0x1d, 0xa6, 0x11, 0xaa, 0x7a, 0xc1, 0xc7, 0x7c, 0xac, 0x17, 0x21, 0x9a, 0x4a, 0xf1, 0xf7, 0x4c, 0x9c, 0x27, 0x90, 0x2b, 0xfb, 0x40, 0x46, 0xfd, 0x2d, 0x96, 0x5e, 0xe5, 0x35, 0x8e, 0x88, 0x33, 0xe3, 0x58, 0xef, 0x54, 0x84, 0x3f, 0x39, 0x82, 0x52, 0xe9, 0x3e, 0x85, 0x55, 0xee, 0xe8, 0x53, 0x83, 0x38, 0x8f, 0x34, 0xe4, 0x5f, 0x59, 0xe2, 0x32, 0x89, 0x41, 0xfa, 0x2a, 0x91, 0x97, 0x2c, 0xfc, 0x47, 0xf0, 0x4b, 0x9b, 0x20, 0x26, 0x9d, 0x4d, 0xf6, 0xc0, 0x7b, 0xab, 0x10, 0x16, 0xad, 0x7d, 0xc6, 0x71, 0xca, 0x1a, 0xa1, 0xa7, 0x1c, 0xcc, 0x77, 0xbf, 0x4, 0xd4, 0x6f, 0x69, 0xd2, 0x2, 0xb9, 0xe, 0xb5, 0x65, 0xde, 0xd8, 0x63, 0xb3, 0x8},
+ {0x0, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a, 0xf, 0xb3, 0x6a, 0xd6, 0xc5, 0x79, 0xa0, 0x1c, 0x86, 0x3a, 0xe3, 0x5f, 0x4c, 0xf0, 0x29, 0x95, 0x1e, 0xa2, 0x7b, 0xc7, 0xd4, 0x68, 0xb1, 0xd, 0x97, 0x2b, 0xf2, 0x4e, 0x5d, 0xe1, 0x38, 0x84, 0x11, 0xad, 0x74, 0xc8, 0xdb, 0x67, 0xbe, 0x2, 0x98, 0x24, 0xfd, 0x41, 0x52, 0xee, 0x37, 0x8b, 0x3c, 0x80, 0x59, 0xe5, 0xf6, 0x4a, 0x93, 0x2f, 0xb5, 0x9, 0xd0, 0x6c, 0x7f, 0xc3, 0x1a, 0xa6, 0x33, 0x8f, 0x56, 0xea, 0xf9, 0x45, 0x9c, 0x20, 0xba, 0x6, 0xdf, 0x63, 0x70, 0xcc, 0x15, 0xa9, 0x22, 0x9e, 0x47, 0xfb, 0xe8, 0x54, 0x8d, 0x31, 0xab, 0x17, 0xce, 0x72, 0x61, 0xdd, 0x4, 0xb8, 0x2d, 0x91, 0x48, 0xf4, 0xe7, 0x5b, 0x82, 0x3e, 0xa4, 0x18, 0xc1, 0x7d, 0x6e, 0xd2, 0xb, 0xb7, 0x78, 0xc4, 0x1d, 0xa1, 0xb2, 0xe, 0xd7, 0x6b, 0xf1, 0x4d, 0x94, 0x28, 0x3b, 0x87, 0x5e, 0xe2, 0x77, 0xcb, 0x12, 0xae, 0xbd, 0x1, 0xd8, 0x64, 0xfe, 0x42, 0x9b, 0x27, 0x34, 0x88, 0x51, 0xed, 0x66, 0xda, 0x3, 0xbf, 0xac, 0x10, 0xc9, 0x75, 0xef, 0x53, 0x8a, 0x36, 0x25, 0x99, 0x40, 0xfc, 0x69, 0xd5, 0xc, 0xb0, 0xa3, 0x1f, 0xc6, 0x7a, 0xe0, 0x5c, 0x85, 0x39, 0x2a, 0x96, 0x4f, 0xf3, 0x44, 0xf8, 0x21, 0x9d, 0x8e, 0x32, 0xeb, 0x57, 0xcd, 0x71, 0xa8, 0x14, 0x7, 0xbb, 0x62, 0xde, 0x4b, 0xf7, 0x2e, 0x92, 0x81, 0x3d, 0xe4, 0x58, 0xc2, 0x7e, 0xa7, 0x1b, 0x8, 0xb4, 0x6d, 0xd1, 0x5a, 0xe6, 0x3f, 0x83, 0x90, 0x2c, 0xf5, 0x49, 0xd3, 0x6f, 0xb6, 0xa, 0x19, 0xa5, 0x7c, 0xc0, 0x55, 0xe9, 0x30, 0x8c, 0x9f, 0x23, 0xfa, 0x46, 0xdc, 0x60, 0xb9, 0x5, 0x16, 0xaa, 0x73, 0xcf},
+ {0x0, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95, 0x1f, 0xa2, 0x78, 0xc5, 0xd1, 0x6c, 0xb6, 0xb, 0x9e, 0x23, 0xf9, 0x44, 0x50, 0xed, 0x37, 0x8a, 0x3e, 0x83, 0x59, 0xe4, 0xf0, 0x4d, 0x97, 0x2a, 0xbf, 0x2, 0xd8, 0x65, 0x71, 0xcc, 0x16, 0xab, 0x21, 0x9c, 0x46, 0xfb, 0xef, 0x52, 0x88, 0x35, 0xa0, 0x1d, 0xc7, 0x7a, 0x6e, 0xd3, 0x9, 0xb4, 0x7c, 0xc1, 0x1b, 0xa6, 0xb2, 0xf, 0xd5, 0x68, 0xfd, 0x40, 0x9a, 0x27, 0x33, 0x8e, 0x54, 0xe9, 0x63, 0xde, 0x4, 0xb9, 0xad, 0x10, 0xca, 0x77, 0xe2, 0x5f, 0x85, 0x38, 0x2c, 0x91, 0x4b, 0xf6, 0x42, 0xff, 0x25, 0x98, 0x8c, 0x31, 0xeb, 0x56, 0xc3, 0x7e, 0xa4, 0x19, 0xd, 0xb0, 0x6a, 0xd7, 0x5d, 0xe0, 0x3a, 0x87, 0x93, 0x2e, 0xf4, 0x49, 0xdc, 0x61, 0xbb, 0x6, 0x12, 0xaf, 0x75, 0xc8, 0xf8, 0x45, 0x9f, 0x22, 0x36, 0x8b, 0x51, 0xec, 0x79, 0xc4, 0x1e, 0xa3, 0xb7, 0xa, 0xd0, 0x6d, 0xe7, 0x5a, 0x80, 0x3d, 0x29, 0x94, 0x4e, 0xf3, 0x66, 0xdb, 0x1, 0xbc, 0xa8, 0x15, 0xcf, 0x72, 0xc6, 0x7b, 0xa1, 0x1c, 0x8, 0xb5, 0x6f, 0xd2, 0x47, 0xfa, 0x20, 0x9d, 0x89, 0x34, 0xee, 0x53, 0xd9, 0x64, 0xbe, 0x3, 0x17, 0xaa, 0x70, 0xcd, 0x58, 0xe5, 0x3f, 0x82, 0x96, 0x2b, 0xf1, 0x4c, 0x84, 0x39, 0xe3, 0x5e, 0x4a, 0xf7, 0x2d, 0x90, 0x5, 0xb8, 0x62, 0xdf, 0xcb, 0x76, 0xac, 0x11, 0x9b, 0x26, 0xfc, 0x41, 0x55, 0xe8, 0x32, 0x8f, 0x1a, 0xa7, 0x7d, 0xc0, 0xd4, 0x69, 0xb3, 0xe, 0xba, 0x7, 0xdd, 0x60, 0x74, 0xc9, 0x13, 0xae, 0x3b, 0x86, 0x5c, 0xe1, 0xf5, 0x48, 0x92, 0x2f, 0xa5, 0x18, 0xc2, 0x7f, 0x6b, 0xd6, 0xc, 0xb1, 0x24, 0x99, 0x43, 0xfe, 0xea, 0x57, 0x8d, 0x30},
+ {0x0, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84, 0x2f, 0x91, 0x4e, 0xf0, 0xed, 0x53, 0x8c, 0x32, 0xb6, 0x8, 0xd7, 0x69, 0x74, 0xca, 0x15, 0xab, 0x5e, 0xe0, 0x3f, 0x81, 0x9c, 0x22, 0xfd, 0x43, 0xc7, 0x79, 0xa6, 0x18, 0x5, 0xbb, 0x64, 0xda, 0x71, 0xcf, 0x10, 0xae, 0xb3, 0xd, 0xd2, 0x6c, 0xe8, 0x56, 0x89, 0x37, 0x2a, 0x94, 0x4b, 0xf5, 0xbc, 0x2, 0xdd, 0x63, 0x7e, 0xc0, 0x1f, 0xa1, 0x25, 0x9b, 0x44, 0xfa, 0xe7, 0x59, 0x86, 0x38, 0x93, 0x2d, 0xf2, 0x4c, 0x51, 0xef, 0x30, 0x8e, 0xa, 0xb4, 0x6b, 0xd5, 0xc8, 0x76, 0xa9, 0x17, 0xe2, 0x5c, 0x83, 0x3d, 0x20, 0x9e, 0x41, 0xff, 0x7b, 0xc5, 0x1a, 0xa4, 0xb9, 0x7, 0xd8, 0x66, 0xcd, 0x73, 0xac, 0x12, 0xf, 0xb1, 0x6e, 0xd0, 0x54, 0xea, 0x35, 0x8b, 0x96, 0x28, 0xf7, 0x49, 0x65, 0xdb, 0x4, 0xba, 0xa7, 0x19, 0xc6, 0x78, 0xfc, 0x42, 0x9d, 0x23, 0x3e, 0x80, 0x5f, 0xe1, 0x4a, 0xf4, 0x2b, 0x95, 0x88, 0x36, 0xe9, 0x57, 0xd3, 0x6d, 0xb2, 0xc, 0x11, 0xaf, 0x70, 0xce, 0x3b, 0x85, 0x5a, 0xe4, 0xf9, 0x47, 0x98, 0x26, 0xa2, 0x1c, 0xc3, 0x7d, 0x60, 0xde, 0x1, 0xbf, 0x14, 0xaa, 0x75, 0xcb, 0xd6, 0x68, 0xb7, 0x9, 0x8d, 0x33, 0xec, 0x52, 0x4f, 0xf1, 0x2e, 0x90, 0xd9, 0x67, 0xb8, 0x6, 0x1b, 0xa5, 0x7a, 0xc4, 0x40, 0xfe, 0x21, 0x9f, 0x82, 0x3c, 0xe3, 0x5d, 0xf6, 0x48, 0x97, 0x29, 0x34, 0x8a, 0x55, 0xeb, 0x6f, 0xd1, 0xe, 0xb0, 0xad, 0x13, 0xcc, 0x72, 0x87, 0x39, 0xe6, 0x58, 0x45, 0xfb, 0x24, 0x9a, 0x1e, 0xa0, 0x7f, 0xc1, 0xdc, 0x62, 0xbd, 0x3, 0xa8, 0x16, 0xc9, 0x77, 0x6a, 0xd4, 0xb, 0xb5, 0x31, 0x8f, 0x50, 0xee, 0xf3, 0x4d, 0x92, 0x2c},
+ {0x0, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b, 0x3f, 0x80, 0x5c, 0xe3, 0xf9, 0x46, 0x9a, 0x25, 0xae, 0x11, 0xcd, 0x72, 0x68, 0xd7, 0xb, 0xb4, 0x7e, 0xc1, 0x1d, 0xa2, 0xb8, 0x7, 0xdb, 0x64, 0xef, 0x50, 0x8c, 0x33, 0x29, 0x96, 0x4a, 0xf5, 0x41, 0xfe, 0x22, 0x9d, 0x87, 0x38, 0xe4, 0x5b, 0xd0, 0x6f, 0xb3, 0xc, 0x16, 0xa9, 0x75, 0xca, 0xfc, 0x43, 0x9f, 0x20, 0x3a, 0x85, 0x59, 0xe6, 0x6d, 0xd2, 0xe, 0xb1, 0xab, 0x14, 0xc8, 0x77, 0xc3, 0x7c, 0xa0, 0x1f, 0x5, 0xba, 0x66, 0xd9, 0x52, 0xed, 0x31, 0x8e, 0x94, 0x2b, 0xf7, 0x48, 0x82, 0x3d, 0xe1, 0x5e, 0x44, 0xfb, 0x27, 0x98, 0x13, 0xac, 0x70, 0xcf, 0xd5, 0x6a, 0xb6, 0x9, 0xbd, 0x2, 0xde, 0x61, 0x7b, 0xc4, 0x18, 0xa7, 0x2c, 0x93, 0x4f, 0xf0, 0xea, 0x55, 0x89, 0x36, 0xe5, 0x5a, 0x86, 0x39, 0x23, 0x9c, 0x40, 0xff, 0x74, 0xcb, 0x17, 0xa8, 0xb2, 0xd, 0xd1, 0x6e, 0xda, 0x65, 0xb9, 0x6, 0x1c, 0xa3, 0x7f, 0xc0, 0x4b, 0xf4, 0x28, 0x97, 0x8d, 0x32, 0xee, 0x51, 0x9b, 0x24, 0xf8, 0x47, 0x5d, 0xe2, 0x3e, 0x81, 0xa, 0xb5, 0x69, 0xd6, 0xcc, 0x73, 0xaf, 0x10, 0xa4, 0x1b, 0xc7, 0x78, 0x62, 0xdd, 0x1, 0xbe, 0x35, 0x8a, 0x56, 0xe9, 0xf3, 0x4c, 0x90, 0x2f, 0x19, 0xa6, 0x7a, 0xc5, 0xdf, 0x60, 0xbc, 0x3, 0x88, 0x37, 0xeb, 0x54, 0x4e, 0xf1, 0x2d, 0x92, 0x26, 0x99, 0x45, 0xfa, 0xe0, 0x5f, 0x83, 0x3c, 0xb7, 0x8, 0xd4, 0x6b, 0x71, 0xce, 0x12, 0xad, 0x67, 0xd8, 0x4, 0xbb, 0xa1, 0x1e, 0xc2, 0x7d, 0xf6, 0x49, 0x95, 0x2a, 0x30, 0x8f, 0x53, 0xec, 0x58, 0xe7, 0x3b, 0x84, 0x9e, 0x21, 0xfd, 0x42, 0xc9, 0x76, 0xaa, 0x15, 0xf, 0xb0, 0x6c, 0xd3},
+ {0x0, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34, 0x9c, 0x5c, 0x1, 0xc1, 0xbb, 0x7b, 0x26, 0xe6, 0xd2, 0x12, 0x4f, 0x8f, 0xf5, 0x35, 0x68, 0xa8, 0x25, 0xe5, 0xb8, 0x78, 0x2, 0xc2, 0x9f, 0x5f, 0x6b, 0xab, 0xf6, 0x36, 0x4c, 0x8c, 0xd1, 0x11, 0xb9, 0x79, 0x24, 0xe4, 0x9e, 0x5e, 0x3, 0xc3, 0xf7, 0x37, 0x6a, 0xaa, 0xd0, 0x10, 0x4d, 0x8d, 0x4a, 0x8a, 0xd7, 0x17, 0x6d, 0xad, 0xf0, 0x30, 0x4, 0xc4, 0x99, 0x59, 0x23, 0xe3, 0xbe, 0x7e, 0xd6, 0x16, 0x4b, 0x8b, 0xf1, 0x31, 0x6c, 0xac, 0x98, 0x58, 0x5, 0xc5, 0xbf, 0x7f, 0x22, 0xe2, 0x6f, 0xaf, 0xf2, 0x32, 0x48, 0x88, 0xd5, 0x15, 0x21, 0xe1, 0xbc, 0x7c, 0x6, 0xc6, 0x9b, 0x5b, 0xf3, 0x33, 0x6e, 0xae, 0xd4, 0x14, 0x49, 0x89, 0xbd, 0x7d, 0x20, 0xe0, 0x9a, 0x5a, 0x7, 0xc7, 0x94, 0x54, 0x9, 0xc9, 0xb3, 0x73, 0x2e, 0xee, 0xda, 0x1a, 0x47, 0x87, 0xfd, 0x3d, 0x60, 0xa0, 0x8, 0xc8, 0x95, 0x55, 0x2f, 0xef, 0xb2, 0x72, 0x46, 0x86, 0xdb, 0x1b, 0x61, 0xa1, 0xfc, 0x3c, 0xb1, 0x71, 0x2c, 0xec, 0x96, 0x56, 0xb, 0xcb, 0xff, 0x3f, 0x62, 0xa2, 0xd8, 0x18, 0x45, 0x85, 0x2d, 0xed, 0xb0, 0x70, 0xa, 0xca, 0x97, 0x57, 0x63, 0xa3, 0xfe, 0x3e, 0x44, 0x84, 0xd9, 0x19, 0xde, 0x1e, 0x43, 0x83, 0xf9, 0x39, 0x64, 0xa4, 0x90, 0x50, 0xd, 0xcd, 0xb7, 0x77, 0x2a, 0xea, 0x42, 0x82, 0xdf, 0x1f, 0x65, 0xa5, 0xf8, 0x38, 0xc, 0xcc, 0x91, 0x51, 0x2b, 0xeb, 0xb6, 0x76, 0xfb, 0x3b, 0x66, 0xa6, 0xdc, 0x1c, 0x41, 0x81, 0xb5, 0x75, 0x28, 0xe8, 0x92, 0x52, 0xf, 0xcf, 0x67, 0xa7, 0xfa, 0x3a, 0x40, 0x80, 0xdd, 0x1d, 0x29, 0xe9, 0xb4, 0x74, 0xe, 0xce, 0x93, 0x53},
+ {0x0, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b, 0x8c, 0x4d, 0x13, 0xd2, 0xaf, 0x6e, 0x30, 0xf1, 0xca, 0xb, 0x55, 0x94, 0xe9, 0x28, 0x76, 0xb7, 0x5, 0xc4, 0x9a, 0x5b, 0x26, 0xe7, 0xb9, 0x78, 0x43, 0x82, 0xdc, 0x1d, 0x60, 0xa1, 0xff, 0x3e, 0x89, 0x48, 0x16, 0xd7, 0xaa, 0x6b, 0x35, 0xf4, 0xcf, 0xe, 0x50, 0x91, 0xec, 0x2d, 0x73, 0xb2, 0xa, 0xcb, 0x95, 0x54, 0x29, 0xe8, 0xb6, 0x77, 0x4c, 0x8d, 0xd3, 0x12, 0x6f, 0xae, 0xf0, 0x31, 0x86, 0x47, 0x19, 0xd8, 0xa5, 0x64, 0x3a, 0xfb, 0xc0, 0x1, 0x5f, 0x9e, 0xe3, 0x22, 0x7c, 0xbd, 0xf, 0xce, 0x90, 0x51, 0x2c, 0xed, 0xb3, 0x72, 0x49, 0x88, 0xd6, 0x17, 0x6a, 0xab, 0xf5, 0x34, 0x83, 0x42, 0x1c, 0xdd, 0xa0, 0x61, 0x3f, 0xfe, 0xc5, 0x4, 0x5a, 0x9b, 0xe6, 0x27, 0x79, 0xb8, 0x14, 0xd5, 0x8b, 0x4a, 0x37, 0xf6, 0xa8, 0x69, 0x52, 0x93, 0xcd, 0xc, 0x71, 0xb0, 0xee, 0x2f, 0x98, 0x59, 0x7, 0xc6, 0xbb, 0x7a, 0x24, 0xe5, 0xde, 0x1f, 0x41, 0x80, 0xfd, 0x3c, 0x62, 0xa3, 0x11, 0xd0, 0x8e, 0x4f, 0x32, 0xf3, 0xad, 0x6c, 0x57, 0x96, 0xc8, 0x9, 0x74, 0xb5, 0xeb, 0x2a, 0x9d, 0x5c, 0x2, 0xc3, 0xbe, 0x7f, 0x21, 0xe0, 0xdb, 0x1a, 0x44, 0x85, 0xf8, 0x39, 0x67, 0xa6, 0x1e, 0xdf, 0x81, 0x40, 0x3d, 0xfc, 0xa2, 0x63, 0x58, 0x99, 0xc7, 0x6, 0x7b, 0xba, 0xe4, 0x25, 0x92, 0x53, 0xd, 0xcc, 0xb1, 0x70, 0x2e, 0xef, 0xd4, 0x15, 0x4b, 0x8a, 0xf7, 0x36, 0x68, 0xa9, 0x1b, 0xda, 0x84, 0x45, 0x38, 0xf9, 0xa7, 0x66, 0x5d, 0x9c, 0xc2, 0x3, 0x7e, 0xbf, 0xe1, 0x20, 0x97, 0x56, 0x8, 0xc9, 0xb4, 0x75, 0x2b, 0xea, 0xd1, 0x10, 0x4e, 0x8f, 0xf2, 0x33, 0x6d, 0xac},
+ {0x0, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x5, 0x71, 0xb3, 0xe8, 0x2a, 0xbc, 0x7e, 0x25, 0xe7, 0x93, 0x51, 0xa, 0xc8, 0xe2, 0x20, 0x7b, 0xb9, 0xcd, 0xf, 0x54, 0x96, 0x65, 0xa7, 0xfc, 0x3e, 0x4a, 0x88, 0xd3, 0x11, 0x3b, 0xf9, 0xa2, 0x60, 0x14, 0xd6, 0x8d, 0x4f, 0xd9, 0x1b, 0x40, 0x82, 0xf6, 0x34, 0x6f, 0xad, 0x87, 0x45, 0x1e, 0xdc, 0xa8, 0x6a, 0x31, 0xf3, 0xca, 0x8, 0x53, 0x91, 0xe5, 0x27, 0x7c, 0xbe, 0x94, 0x56, 0xd, 0xcf, 0xbb, 0x79, 0x22, 0xe0, 0x76, 0xb4, 0xef, 0x2d, 0x59, 0x9b, 0xc0, 0x2, 0x28, 0xea, 0xb1, 0x73, 0x7, 0xc5, 0x9e, 0x5c, 0xaf, 0x6d, 0x36, 0xf4, 0x80, 0x42, 0x19, 0xdb, 0xf1, 0x33, 0x68, 0xaa, 0xde, 0x1c, 0x47, 0x85, 0x13, 0xd1, 0x8a, 0x48, 0x3c, 0xfe, 0xa5, 0x67, 0x4d, 0x8f, 0xd4, 0x16, 0x62, 0xa0, 0xfb, 0x39, 0x89, 0x4b, 0x10, 0xd2, 0xa6, 0x64, 0x3f, 0xfd, 0xd7, 0x15, 0x4e, 0x8c, 0xf8, 0x3a, 0x61, 0xa3, 0x35, 0xf7, 0xac, 0x6e, 0x1a, 0xd8, 0x83, 0x41, 0x6b, 0xa9, 0xf2, 0x30, 0x44, 0x86, 0xdd, 0x1f, 0xec, 0x2e, 0x75, 0xb7, 0xc3, 0x1, 0x5a, 0x98, 0xb2, 0x70, 0x2b, 0xe9, 0x9d, 0x5f, 0x4, 0xc6, 0x50, 0x92, 0xc9, 0xb, 0x7f, 0xbd, 0xe6, 0x24, 0xe, 0xcc, 0x97, 0x55, 0x21, 0xe3, 0xb8, 0x7a, 0x43, 0x81, 0xda, 0x18, 0x6c, 0xae, 0xf5, 0x37, 0x1d, 0xdf, 0x84, 0x46, 0x32, 0xf0, 0xab, 0x69, 0xff, 0x3d, 0x66, 0xa4, 0xd0, 0x12, 0x49, 0x8b, 0xa1, 0x63, 0x38, 0xfa, 0x8e, 0x4c, 0x17, 0xd5, 0x26, 0xe4, 0xbf, 0x7d, 0x9, 0xcb, 0x90, 0x52, 0x78, 0xba, 0xe1, 0x23, 0x57, 0x95, 0xce, 0xc, 0x9a, 0x58, 0x3, 0xc1, 0xb5, 0x77, 0x2c, 0xee, 0xc4, 0x6, 0x5d, 0x9f, 0xeb, 0x29, 0x72, 0xb0},
+ {0x0, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0xe, 0x7d, 0xbe, 0xe6, 0x25, 0xac, 0x6f, 0x37, 0xf4, 0x87, 0x44, 0x1c, 0xdf, 0xfa, 0x39, 0x61, 0xa2, 0xd1, 0x12, 0x4a, 0x89, 0x45, 0x86, 0xde, 0x1d, 0x6e, 0xad, 0xf5, 0x36, 0x13, 0xd0, 0x88, 0x4b, 0x38, 0xfb, 0xa3, 0x60, 0xe9, 0x2a, 0x72, 0xb1, 0xc2, 0x1, 0x59, 0x9a, 0xbf, 0x7c, 0x24, 0xe7, 0x94, 0x57, 0xf, 0xcc, 0x8a, 0x49, 0x11, 0xd2, 0xa1, 0x62, 0x3a, 0xf9, 0xdc, 0x1f, 0x47, 0x84, 0xf7, 0x34, 0x6c, 0xaf, 0x26, 0xe5, 0xbd, 0x7e, 0xd, 0xce, 0x96, 0x55, 0x70, 0xb3, 0xeb, 0x28, 0x5b, 0x98, 0xc0, 0x3, 0xcf, 0xc, 0x54, 0x97, 0xe4, 0x27, 0x7f, 0xbc, 0x99, 0x5a, 0x2, 0xc1, 0xb2, 0x71, 0x29, 0xea, 0x63, 0xa0, 0xf8, 0x3b, 0x48, 0x8b, 0xd3, 0x10, 0x35, 0xf6, 0xae, 0x6d, 0x1e, 0xdd, 0x85, 0x46, 0x9, 0xca, 0x92, 0x51, 0x22, 0xe1, 0xb9, 0x7a, 0x5f, 0x9c, 0xc4, 0x7, 0x74, 0xb7, 0xef, 0x2c, 0xa5, 0x66, 0x3e, 0xfd, 0x8e, 0x4d, 0x15, 0xd6, 0xf3, 0x30, 0x68, 0xab, 0xd8, 0x1b, 0x43, 0x80, 0x4c, 0x8f, 0xd7, 0x14, 0x67, 0xa4, 0xfc, 0x3f, 0x1a, 0xd9, 0x81, 0x42, 0x31, 0xf2, 0xaa, 0x69, 0xe0, 0x23, 0x7b, 0xb8, 0xcb, 0x8, 0x50, 0x93, 0xb6, 0x75, 0x2d, 0xee, 0x9d, 0x5e, 0x6, 0xc5, 0x83, 0x40, 0x18, 0xdb, 0xa8, 0x6b, 0x33, 0xf0, 0xd5, 0x16, 0x4e, 0x8d, 0xfe, 0x3d, 0x65, 0xa6, 0x2f, 0xec, 0xb4, 0x77, 0x4, 0xc7, 0x9f, 0x5c, 0x79, 0xba, 0xe2, 0x21, 0x52, 0x91, 0xc9, 0xa, 0xc6, 0x5, 0x5d, 0x9e, 0xed, 0x2e, 0x76, 0xb5, 0x90, 0x53, 0xb, 0xc8, 0xbb, 0x78, 0x20, 0xe3, 0x6a, 0xa9, 0xf1, 0x32, 0x41, 0x82, 0xda, 0x19, 0x3c, 0xff, 0xa7, 0x64, 0x17, 0xd4, 0x8c, 0x4f},
+ {0x0, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x8, 0xdc, 0x18, 0x49, 0x8d, 0xeb, 0x2f, 0x7e, 0xba, 0xb2, 0x76, 0x27, 0xe3, 0x85, 0x41, 0x10, 0xd4, 0xa5, 0x61, 0x30, 0xf4, 0x92, 0x56, 0x7, 0xc3, 0xcb, 0xf, 0x5e, 0x9a, 0xfc, 0x38, 0x69, 0xad, 0x79, 0xbd, 0xec, 0x28, 0x4e, 0x8a, 0xdb, 0x1f, 0x17, 0xd3, 0x82, 0x46, 0x20, 0xe4, 0xb5, 0x71, 0x57, 0x93, 0xc2, 0x6, 0x60, 0xa4, 0xf5, 0x31, 0x39, 0xfd, 0xac, 0x68, 0xe, 0xca, 0x9b, 0x5f, 0x8b, 0x4f, 0x1e, 0xda, 0xbc, 0x78, 0x29, 0xed, 0xe5, 0x21, 0x70, 0xb4, 0xd2, 0x16, 0x47, 0x83, 0xf2, 0x36, 0x67, 0xa3, 0xc5, 0x1, 0x50, 0x94, 0x9c, 0x58, 0x9, 0xcd, 0xab, 0x6f, 0x3e, 0xfa, 0x2e, 0xea, 0xbb, 0x7f, 0x19, 0xdd, 0x8c, 0x48, 0x40, 0x84, 0xd5, 0x11, 0x77, 0xb3, 0xe2, 0x26, 0xae, 0x6a, 0x3b, 0xff, 0x99, 0x5d, 0xc, 0xc8, 0xc0, 0x4, 0x55, 0x91, 0xf7, 0x33, 0x62, 0xa6, 0x72, 0xb6, 0xe7, 0x23, 0x45, 0x81, 0xd0, 0x14, 0x1c, 0xd8, 0x89, 0x4d, 0x2b, 0xef, 0xbe, 0x7a, 0xb, 0xcf, 0x9e, 0x5a, 0x3c, 0xf8, 0xa9, 0x6d, 0x65, 0xa1, 0xf0, 0x34, 0x52, 0x96, 0xc7, 0x3, 0xd7, 0x13, 0x42, 0x86, 0xe0, 0x24, 0x75, 0xb1, 0xb9, 0x7d, 0x2c, 0xe8, 0x8e, 0x4a, 0x1b, 0xdf, 0xf9, 0x3d, 0x6c, 0xa8, 0xce, 0xa, 0x5b, 0x9f, 0x97, 0x53, 0x2, 0xc6, 0xa0, 0x64, 0x35, 0xf1, 0x25, 0xe1, 0xb0, 0x74, 0x12, 0xd6, 0x87, 0x43, 0x4b, 0x8f, 0xde, 0x1a, 0x7c, 0xb8, 0xe9, 0x2d, 0x5c, 0x98, 0xc9, 0xd, 0x6b, 0xaf, 0xfe, 0x3a, 0x32, 0xf6, 0xa7, 0x63, 0x5, 0xc1, 0x90, 0x54, 0x80, 0x44, 0x15, 0xd1, 0xb7, 0x73, 0x22, 0xe6, 0xee, 0x2a, 0x7b, 0xbf, 0xd9, 0x1d, 0x4c, 0x88},
+ {0x0, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x7, 0xcc, 0x9, 0x5b, 0x9e, 0xff, 0x3a, 0x68, 0xad, 0xaa, 0x6f, 0x3d, 0xf8, 0x99, 0x5c, 0xe, 0xcb, 0x85, 0x40, 0x12, 0xd7, 0xb6, 0x73, 0x21, 0xe4, 0xe3, 0x26, 0x74, 0xb1, 0xd0, 0x15, 0x47, 0x82, 0x49, 0x8c, 0xde, 0x1b, 0x7a, 0xbf, 0xed, 0x28, 0x2f, 0xea, 0xb8, 0x7d, 0x1c, 0xd9, 0x8b, 0x4e, 0x17, 0xd2, 0x80, 0x45, 0x24, 0xe1, 0xb3, 0x76, 0x71, 0xb4, 0xe6, 0x23, 0x42, 0x87, 0xd5, 0x10, 0xdb, 0x1e, 0x4c, 0x89, 0xe8, 0x2d, 0x7f, 0xba, 0xbd, 0x78, 0x2a, 0xef, 0x8e, 0x4b, 0x19, 0xdc, 0x92, 0x57, 0x5, 0xc0, 0xa1, 0x64, 0x36, 0xf3, 0xf4, 0x31, 0x63, 0xa6, 0xc7, 0x2, 0x50, 0x95, 0x5e, 0x9b, 0xc9, 0xc, 0x6d, 0xa8, 0xfa, 0x3f, 0x38, 0xfd, 0xaf, 0x6a, 0xb, 0xce, 0x9c, 0x59, 0x2e, 0xeb, 0xb9, 0x7c, 0x1d, 0xd8, 0x8a, 0x4f, 0x48, 0x8d, 0xdf, 0x1a, 0x7b, 0xbe, 0xec, 0x29, 0xe2, 0x27, 0x75, 0xb0, 0xd1, 0x14, 0x46, 0x83, 0x84, 0x41, 0x13, 0xd6, 0xb7, 0x72, 0x20, 0xe5, 0xab, 0x6e, 0x3c, 0xf9, 0x98, 0x5d, 0xf, 0xca, 0xcd, 0x8, 0x5a, 0x9f, 0xfe, 0x3b, 0x69, 0xac, 0x67, 0xa2, 0xf0, 0x35, 0x54, 0x91, 0xc3, 0x6, 0x1, 0xc4, 0x96, 0x53, 0x32, 0xf7, 0xa5, 0x60, 0x39, 0xfc, 0xae, 0x6b, 0xa, 0xcf, 0x9d, 0x58, 0x5f, 0x9a, 0xc8, 0xd, 0x6c, 0xa9, 0xfb, 0x3e, 0xf5, 0x30, 0x62, 0xa7, 0xc6, 0x3, 0x51, 0x94, 0x93, 0x56, 0x4, 0xc1, 0xa0, 0x65, 0x37, 0xf2, 0xbc, 0x79, 0x2b, 0xee, 0x8f, 0x4a, 0x18, 0xdd, 0xda, 0x1f, 0x4d, 0x88, 0xe9, 0x2c, 0x7e, 0xbb, 0x70, 0xb5, 0xe7, 0x22, 0x43, 0x86, 0xd4, 0x11, 0x16, 0xd3, 0x81, 0x44, 0x25, 0xe0, 0xb2, 0x77},
+ {0x0, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16, 0xfc, 0x3a, 0x6d, 0xab, 0xc3, 0x5, 0x52, 0x94, 0x82, 0x44, 0x13, 0xd5, 0xbd, 0x7b, 0x2c, 0xea, 0xe5, 0x23, 0x74, 0xb2, 0xda, 0x1c, 0x4b, 0x8d, 0x9b, 0x5d, 0xa, 0xcc, 0xa4, 0x62, 0x35, 0xf3, 0x19, 0xdf, 0x88, 0x4e, 0x26, 0xe0, 0xb7, 0x71, 0x67, 0xa1, 0xf6, 0x30, 0x58, 0x9e, 0xc9, 0xf, 0xd7, 0x11, 0x46, 0x80, 0xe8, 0x2e, 0x79, 0xbf, 0xa9, 0x6f, 0x38, 0xfe, 0x96, 0x50, 0x7, 0xc1, 0x2b, 0xed, 0xba, 0x7c, 0x14, 0xd2, 0x85, 0x43, 0x55, 0x93, 0xc4, 0x2, 0x6a, 0xac, 0xfb, 0x3d, 0x32, 0xf4, 0xa3, 0x65, 0xd, 0xcb, 0x9c, 0x5a, 0x4c, 0x8a, 0xdd, 0x1b, 0x73, 0xb5, 0xe2, 0x24, 0xce, 0x8, 0x5f, 0x99, 0xf1, 0x37, 0x60, 0xa6, 0xb0, 0x76, 0x21, 0xe7, 0x8f, 0x49, 0x1e, 0xd8, 0xb3, 0x75, 0x22, 0xe4, 0x8c, 0x4a, 0x1d, 0xdb, 0xcd, 0xb, 0x5c, 0x9a, 0xf2, 0x34, 0x63, 0xa5, 0x4f, 0x89, 0xde, 0x18, 0x70, 0xb6, 0xe1, 0x27, 0x31, 0xf7, 0xa0, 0x66, 0xe, 0xc8, 0x9f, 0x59, 0x56, 0x90, 0xc7, 0x1, 0x69, 0xaf, 0xf8, 0x3e, 0x28, 0xee, 0xb9, 0x7f, 0x17, 0xd1, 0x86, 0x40, 0xaa, 0x6c, 0x3b, 0xfd, 0x95, 0x53, 0x4, 0xc2, 0xd4, 0x12, 0x45, 0x83, 0xeb, 0x2d, 0x7a, 0xbc, 0x64, 0xa2, 0xf5, 0x33, 0x5b, 0x9d, 0xca, 0xc, 0x1a, 0xdc, 0x8b, 0x4d, 0x25, 0xe3, 0xb4, 0x72, 0x98, 0x5e, 0x9, 0xcf, 0xa7, 0x61, 0x36, 0xf0, 0xe6, 0x20, 0x77, 0xb1, 0xd9, 0x1f, 0x48, 0x8e, 0x81, 0x47, 0x10, 0xd6, 0xbe, 0x78, 0x2f, 0xe9, 0xff, 0x39, 0x6e, 0xa8, 0xc0, 0x6, 0x51, 0x97, 0x7d, 0xbb, 0xec, 0x2a, 0x42, 0x84, 0xd3, 0x15, 0x3, 0xc5, 0x92, 0x54, 0x3c, 0xfa, 0xad, 0x6b},
+ {0x0, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19, 0xec, 0x2b, 0x7f, 0xb8, 0xd7, 0x10, 0x44, 0x83, 0x9a, 0x5d, 0x9, 0xce, 0xa1, 0x66, 0x32, 0xf5, 0xc5, 0x2, 0x56, 0x91, 0xfe, 0x39, 0x6d, 0xaa, 0xb3, 0x74, 0x20, 0xe7, 0x88, 0x4f, 0x1b, 0xdc, 0x29, 0xee, 0xba, 0x7d, 0x12, 0xd5, 0x81, 0x46, 0x5f, 0x98, 0xcc, 0xb, 0x64, 0xa3, 0xf7, 0x30, 0x97, 0x50, 0x4, 0xc3, 0xac, 0x6b, 0x3f, 0xf8, 0xe1, 0x26, 0x72, 0xb5, 0xda, 0x1d, 0x49, 0x8e, 0x7b, 0xbc, 0xe8, 0x2f, 0x40, 0x87, 0xd3, 0x14, 0xd, 0xca, 0x9e, 0x59, 0x36, 0xf1, 0xa5, 0x62, 0x52, 0x95, 0xc1, 0x6, 0x69, 0xae, 0xfa, 0x3d, 0x24, 0xe3, 0xb7, 0x70, 0x1f, 0xd8, 0x8c, 0x4b, 0xbe, 0x79, 0x2d, 0xea, 0x85, 0x42, 0x16, 0xd1, 0xc8, 0xf, 0x5b, 0x9c, 0xf3, 0x34, 0x60, 0xa7, 0x33, 0xf4, 0xa0, 0x67, 0x8, 0xcf, 0x9b, 0x5c, 0x45, 0x82, 0xd6, 0x11, 0x7e, 0xb9, 0xed, 0x2a, 0xdf, 0x18, 0x4c, 0x8b, 0xe4, 0x23, 0x77, 0xb0, 0xa9, 0x6e, 0x3a, 0xfd, 0x92, 0x55, 0x1, 0xc6, 0xf6, 0x31, 0x65, 0xa2, 0xcd, 0xa, 0x5e, 0x99, 0x80, 0x47, 0x13, 0xd4, 0xbb, 0x7c, 0x28, 0xef, 0x1a, 0xdd, 0x89, 0x4e, 0x21, 0xe6, 0xb2, 0x75, 0x6c, 0xab, 0xff, 0x38, 0x57, 0x90, 0xc4, 0x3, 0xa4, 0x63, 0x37, 0xf0, 0x9f, 0x58, 0xc, 0xcb, 0xd2, 0x15, 0x41, 0x86, 0xe9, 0x2e, 0x7a, 0xbd, 0x48, 0x8f, 0xdb, 0x1c, 0x73, 0xb4, 0xe0, 0x27, 0x3e, 0xf9, 0xad, 0x6a, 0x5, 0xc2, 0x96, 0x51, 0x61, 0xa6, 0xf2, 0x35, 0x5a, 0x9d, 0xc9, 0xe, 0x17, 0xd0, 0x84, 0x43, 0x2c, 0xeb, 0xbf, 0x78, 0x8d, 0x4a, 0x1e, 0xd9, 0xb6, 0x71, 0x25, 0xe2, 0xfb, 0x3c, 0x68, 0xaf, 0xc0, 0x7, 0x53, 0x94},
+ {0x0, 0xc8, 0x8d, 0x45, 0x7, 0xcf, 0x8a, 0x42, 0xe, 0xc6, 0x83, 0x4b, 0x9, 0xc1, 0x84, 0x4c, 0x1c, 0xd4, 0x91, 0x59, 0x1b, 0xd3, 0x96, 0x5e, 0x12, 0xda, 0x9f, 0x57, 0x15, 0xdd, 0x98, 0x50, 0x38, 0xf0, 0xb5, 0x7d, 0x3f, 0xf7, 0xb2, 0x7a, 0x36, 0xfe, 0xbb, 0x73, 0x31, 0xf9, 0xbc, 0x74, 0x24, 0xec, 0xa9, 0x61, 0x23, 0xeb, 0xae, 0x66, 0x2a, 0xe2, 0xa7, 0x6f, 0x2d, 0xe5, 0xa0, 0x68, 0x70, 0xb8, 0xfd, 0x35, 0x77, 0xbf, 0xfa, 0x32, 0x7e, 0xb6, 0xf3, 0x3b, 0x79, 0xb1, 0xf4, 0x3c, 0x6c, 0xa4, 0xe1, 0x29, 0x6b, 0xa3, 0xe6, 0x2e, 0x62, 0xaa, 0xef, 0x27, 0x65, 0xad, 0xe8, 0x20, 0x48, 0x80, 0xc5, 0xd, 0x4f, 0x87, 0xc2, 0xa, 0x46, 0x8e, 0xcb, 0x3, 0x41, 0x89, 0xcc, 0x4, 0x54, 0x9c, 0xd9, 0x11, 0x53, 0x9b, 0xde, 0x16, 0x5a, 0x92, 0xd7, 0x1f, 0x5d, 0x95, 0xd0, 0x18, 0xe0, 0x28, 0x6d, 0xa5, 0xe7, 0x2f, 0x6a, 0xa2, 0xee, 0x26, 0x63, 0xab, 0xe9, 0x21, 0x64, 0xac, 0xfc, 0x34, 0x71, 0xb9, 0xfb, 0x33, 0x76, 0xbe, 0xf2, 0x3a, 0x7f, 0xb7, 0xf5, 0x3d, 0x78, 0xb0, 0xd8, 0x10, 0x55, 0x9d, 0xdf, 0x17, 0x52, 0x9a, 0xd6, 0x1e, 0x5b, 0x93, 0xd1, 0x19, 0x5c, 0x94, 0xc4, 0xc, 0x49, 0x81, 0xc3, 0xb, 0x4e, 0x86, 0xca, 0x2, 0x47, 0x8f, 0xcd, 0x5, 0x40, 0x88, 0x90, 0x58, 0x1d, 0xd5, 0x97, 0x5f, 0x1a, 0xd2, 0x9e, 0x56, 0x13, 0xdb, 0x99, 0x51, 0x14, 0xdc, 0x8c, 0x44, 0x1, 0xc9, 0x8b, 0x43, 0x6, 0xce, 0x82, 0x4a, 0xf, 0xc7, 0x85, 0x4d, 0x8, 0xc0, 0xa8, 0x60, 0x25, 0xed, 0xaf, 0x67, 0x22, 0xea, 0xa6, 0x6e, 0x2b, 0xe3, 0xa1, 0x69, 0x2c, 0xe4, 0xb4, 0x7c, 0x39, 0xf1, 0xb3, 0x7b, 0x3e, 0xf6, 0xba, 0x72, 0x37, 0xff, 0xbd, 0x75, 0x30, 0xf8},
+ {0x0, 0xc9, 0x8f, 0x46, 0x3, 0xca, 0x8c, 0x45, 0x6, 0xcf, 0x89, 0x40, 0x5, 0xcc, 0x8a, 0x43, 0xc, 0xc5, 0x83, 0x4a, 0xf, 0xc6, 0x80, 0x49, 0xa, 0xc3, 0x85, 0x4c, 0x9, 0xc0, 0x86, 0x4f, 0x18, 0xd1, 0x97, 0x5e, 0x1b, 0xd2, 0x94, 0x5d, 0x1e, 0xd7, 0x91, 0x58, 0x1d, 0xd4, 0x92, 0x5b, 0x14, 0xdd, 0x9b, 0x52, 0x17, 0xde, 0x98, 0x51, 0x12, 0xdb, 0x9d, 0x54, 0x11, 0xd8, 0x9e, 0x57, 0x30, 0xf9, 0xbf, 0x76, 0x33, 0xfa, 0xbc, 0x75, 0x36, 0xff, 0xb9, 0x70, 0x35, 0xfc, 0xba, 0x73, 0x3c, 0xf5, 0xb3, 0x7a, 0x3f, 0xf6, 0xb0, 0x79, 0x3a, 0xf3, 0xb5, 0x7c, 0x39, 0xf0, 0xb6, 0x7f, 0x28, 0xe1, 0xa7, 0x6e, 0x2b, 0xe2, 0xa4, 0x6d, 0x2e, 0xe7, 0xa1, 0x68, 0x2d, 0xe4, 0xa2, 0x6b, 0x24, 0xed, 0xab, 0x62, 0x27, 0xee, 0xa8, 0x61, 0x22, 0xeb, 0xad, 0x64, 0x21, 0xe8, 0xae, 0x67, 0x60, 0xa9, 0xef, 0x26, 0x63, 0xaa, 0xec, 0x25, 0x66, 0xaf, 0xe9, 0x20, 0x65, 0xac, 0xea, 0x23, 0x6c, 0xa5, 0xe3, 0x2a, 0x6f, 0xa6, 0xe0, 0x29, 0x6a, 0xa3, 0xe5, 0x2c, 0x69, 0xa0, 0xe6, 0x2f, 0x78, 0xb1, 0xf7, 0x3e, 0x7b, 0xb2, 0xf4, 0x3d, 0x7e, 0xb7, 0xf1, 0x38, 0x7d, 0xb4, 0xf2, 0x3b, 0x74, 0xbd, 0xfb, 0x32, 0x77, 0xbe, 0xf8, 0x31, 0x72, 0xbb, 0xfd, 0x34, 0x71, 0xb8, 0xfe, 0x37, 0x50, 0x99, 0xdf, 0x16, 0x53, 0x9a, 0xdc, 0x15, 0x56, 0x9f, 0xd9, 0x10, 0x55, 0x9c, 0xda, 0x13, 0x5c, 0x95, 0xd3, 0x1a, 0x5f, 0x96, 0xd0, 0x19, 0x5a, 0x93, 0xd5, 0x1c, 0x59, 0x90, 0xd6, 0x1f, 0x48, 0x81, 0xc7, 0xe, 0x4b, 0x82, 0xc4, 0xd, 0x4e, 0x87, 0xc1, 0x8, 0x4d, 0x84, 0xc2, 0xb, 0x44, 0x8d, 0xcb, 0x2, 0x47, 0x8e, 0xc8, 0x1, 0x42, 0x8b, 0xcd, 0x4, 0x41, 0x88, 0xce, 0x7},
+ {0x0, 0xca, 0x89, 0x43, 0xf, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52, 0x3c, 0xf6, 0xb5, 0x7f, 0x33, 0xf9, 0xba, 0x70, 0x22, 0xe8, 0xab, 0x61, 0x2d, 0xe7, 0xa4, 0x6e, 0x78, 0xb2, 0xf1, 0x3b, 0x77, 0xbd, 0xfe, 0x34, 0x66, 0xac, 0xef, 0x25, 0x69, 0xa3, 0xe0, 0x2a, 0x44, 0x8e, 0xcd, 0x7, 0x4b, 0x81, 0xc2, 0x8, 0x5a, 0x90, 0xd3, 0x19, 0x55, 0x9f, 0xdc, 0x16, 0xf0, 0x3a, 0x79, 0xb3, 0xff, 0x35, 0x76, 0xbc, 0xee, 0x24, 0x67, 0xad, 0xe1, 0x2b, 0x68, 0xa2, 0xcc, 0x6, 0x45, 0x8f, 0xc3, 0x9, 0x4a, 0x80, 0xd2, 0x18, 0x5b, 0x91, 0xdd, 0x17, 0x54, 0x9e, 0x88, 0x42, 0x1, 0xcb, 0x87, 0x4d, 0xe, 0xc4, 0x96, 0x5c, 0x1f, 0xd5, 0x99, 0x53, 0x10, 0xda, 0xb4, 0x7e, 0x3d, 0xf7, 0xbb, 0x71, 0x32, 0xf8, 0xaa, 0x60, 0x23, 0xe9, 0xa5, 0x6f, 0x2c, 0xe6, 0xfd, 0x37, 0x74, 0xbe, 0xf2, 0x38, 0x7b, 0xb1, 0xe3, 0x29, 0x6a, 0xa0, 0xec, 0x26, 0x65, 0xaf, 0xc1, 0xb, 0x48, 0x82, 0xce, 0x4, 0x47, 0x8d, 0xdf, 0x15, 0x56, 0x9c, 0xd0, 0x1a, 0x59, 0x93, 0x85, 0x4f, 0xc, 0xc6, 0x8a, 0x40, 0x3, 0xc9, 0x9b, 0x51, 0x12, 0xd8, 0x94, 0x5e, 0x1d, 0xd7, 0xb9, 0x73, 0x30, 0xfa, 0xb6, 0x7c, 0x3f, 0xf5, 0xa7, 0x6d, 0x2e, 0xe4, 0xa8, 0x62, 0x21, 0xeb, 0xd, 0xc7, 0x84, 0x4e, 0x2, 0xc8, 0x8b, 0x41, 0x13, 0xd9, 0x9a, 0x50, 0x1c, 0xd6, 0x95, 0x5f, 0x31, 0xfb, 0xb8, 0x72, 0x3e, 0xf4, 0xb7, 0x7d, 0x2f, 0xe5, 0xa6, 0x6c, 0x20, 0xea, 0xa9, 0x63, 0x75, 0xbf, 0xfc, 0x36, 0x7a, 0xb0, 0xf3, 0x39, 0x6b, 0xa1, 0xe2, 0x28, 0x64, 0xae, 0xed, 0x27, 0x49, 0x83, 0xc0, 0xa, 0x46, 0x8c, 0xcf, 0x5, 0x57, 0x9d, 0xde, 0x14, 0x58, 0x92, 0xd1, 0x1b},
+ {0x0, 0xcb, 0x8b, 0x40, 0xb, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d, 0x2c, 0xe7, 0xa7, 0x6c, 0x27, 0xec, 0xac, 0x67, 0x3a, 0xf1, 0xb1, 0x7a, 0x31, 0xfa, 0xba, 0x71, 0x58, 0x93, 0xd3, 0x18, 0x53, 0x98, 0xd8, 0x13, 0x4e, 0x85, 0xc5, 0xe, 0x45, 0x8e, 0xce, 0x5, 0x74, 0xbf, 0xff, 0x34, 0x7f, 0xb4, 0xf4, 0x3f, 0x62, 0xa9, 0xe9, 0x22, 0x69, 0xa2, 0xe2, 0x29, 0xb0, 0x7b, 0x3b, 0xf0, 0xbb, 0x70, 0x30, 0xfb, 0xa6, 0x6d, 0x2d, 0xe6, 0xad, 0x66, 0x26, 0xed, 0x9c, 0x57, 0x17, 0xdc, 0x97, 0x5c, 0x1c, 0xd7, 0x8a, 0x41, 0x1, 0xca, 0x81, 0x4a, 0xa, 0xc1, 0xe8, 0x23, 0x63, 0xa8, 0xe3, 0x28, 0x68, 0xa3, 0xfe, 0x35, 0x75, 0xbe, 0xf5, 0x3e, 0x7e, 0xb5, 0xc4, 0xf, 0x4f, 0x84, 0xcf, 0x4, 0x44, 0x8f, 0xd2, 0x19, 0x59, 0x92, 0xd9, 0x12, 0x52, 0x99, 0x7d, 0xb6, 0xf6, 0x3d, 0x76, 0xbd, 0xfd, 0x36, 0x6b, 0xa0, 0xe0, 0x2b, 0x60, 0xab, 0xeb, 0x20, 0x51, 0x9a, 0xda, 0x11, 0x5a, 0x91, 0xd1, 0x1a, 0x47, 0x8c, 0xcc, 0x7, 0x4c, 0x87, 0xc7, 0xc, 0x25, 0xee, 0xae, 0x65, 0x2e, 0xe5, 0xa5, 0x6e, 0x33, 0xf8, 0xb8, 0x73, 0x38, 0xf3, 0xb3, 0x78, 0x9, 0xc2, 0x82, 0x49, 0x2, 0xc9, 0x89, 0x42, 0x1f, 0xd4, 0x94, 0x5f, 0x14, 0xdf, 0x9f, 0x54, 0xcd, 0x6, 0x46, 0x8d, 0xc6, 0xd, 0x4d, 0x86, 0xdb, 0x10, 0x50, 0x9b, 0xd0, 0x1b, 0x5b, 0x90, 0xe1, 0x2a, 0x6a, 0xa1, 0xea, 0x21, 0x61, 0xaa, 0xf7, 0x3c, 0x7c, 0xb7, 0xfc, 0x37, 0x77, 0xbc, 0x95, 0x5e, 0x1e, 0xd5, 0x9e, 0x55, 0x15, 0xde, 0x83, 0x48, 0x8, 0xc3, 0x88, 0x43, 0x3, 0xc8, 0xb9, 0x72, 0x32, 0xf9, 0xb2, 0x79, 0x39, 0xf2, 0xaf, 0x64, 0x24, 0xef, 0xa4, 0x6f, 0x2f, 0xe4},
+ {0x0, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70, 0x5c, 0x90, 0xd9, 0x15, 0x4b, 0x87, 0xce, 0x2, 0x72, 0xbe, 0xf7, 0x3b, 0x65, 0xa9, 0xe0, 0x2c, 0xb8, 0x74, 0x3d, 0xf1, 0xaf, 0x63, 0x2a, 0xe6, 0x96, 0x5a, 0x13, 0xdf, 0x81, 0x4d, 0x4, 0xc8, 0xe4, 0x28, 0x61, 0xad, 0xf3, 0x3f, 0x76, 0xba, 0xca, 0x6, 0x4f, 0x83, 0xdd, 0x11, 0x58, 0x94, 0x6d, 0xa1, 0xe8, 0x24, 0x7a, 0xb6, 0xff, 0x33, 0x43, 0x8f, 0xc6, 0xa, 0x54, 0x98, 0xd1, 0x1d, 0x31, 0xfd, 0xb4, 0x78, 0x26, 0xea, 0xa3, 0x6f, 0x1f, 0xd3, 0x9a, 0x56, 0x8, 0xc4, 0x8d, 0x41, 0xd5, 0x19, 0x50, 0x9c, 0xc2, 0xe, 0x47, 0x8b, 0xfb, 0x37, 0x7e, 0xb2, 0xec, 0x20, 0x69, 0xa5, 0x89, 0x45, 0xc, 0xc0, 0x9e, 0x52, 0x1b, 0xd7, 0xa7, 0x6b, 0x22, 0xee, 0xb0, 0x7c, 0x35, 0xf9, 0xda, 0x16, 0x5f, 0x93, 0xcd, 0x1, 0x48, 0x84, 0xf4, 0x38, 0x71, 0xbd, 0xe3, 0x2f, 0x66, 0xaa, 0x86, 0x4a, 0x3, 0xcf, 0x91, 0x5d, 0x14, 0xd8, 0xa8, 0x64, 0x2d, 0xe1, 0xbf, 0x73, 0x3a, 0xf6, 0x62, 0xae, 0xe7, 0x2b, 0x75, 0xb9, 0xf0, 0x3c, 0x4c, 0x80, 0xc9, 0x5, 0x5b, 0x97, 0xde, 0x12, 0x3e, 0xf2, 0xbb, 0x77, 0x29, 0xe5, 0xac, 0x60, 0x10, 0xdc, 0x95, 0x59, 0x7, 0xcb, 0x82, 0x4e, 0xb7, 0x7b, 0x32, 0xfe, 0xa0, 0x6c, 0x25, 0xe9, 0x99, 0x55, 0x1c, 0xd0, 0x8e, 0x42, 0xb, 0xc7, 0xeb, 0x27, 0x6e, 0xa2, 0xfc, 0x30, 0x79, 0xb5, 0xc5, 0x9, 0x40, 0x8c, 0xd2, 0x1e, 0x57, 0x9b, 0xf, 0xc3, 0x8a, 0x46, 0x18, 0xd4, 0x9d, 0x51, 0x21, 0xed, 0xa4, 0x68, 0x36, 0xfa, 0xb3, 0x7f, 0x53, 0x9f, 0xd6, 0x1a, 0x44, 0x88, 0xc1, 0xd, 0x7d, 0xb1, 0xf8, 0x34, 0x6a, 0xa6, 0xef, 0x23},
+ {0x0, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f, 0x4c, 0x81, 0xcb, 0x6, 0x5f, 0x92, 0xd8, 0x15, 0x6a, 0xa7, 0xed, 0x20, 0x79, 0xb4, 0xfe, 0x33, 0x98, 0x55, 0x1f, 0xd2, 0x8b, 0x46, 0xc, 0xc1, 0xbe, 0x73, 0x39, 0xf4, 0xad, 0x60, 0x2a, 0xe7, 0xd4, 0x19, 0x53, 0x9e, 0xc7, 0xa, 0x40, 0x8d, 0xf2, 0x3f, 0x75, 0xb8, 0xe1, 0x2c, 0x66, 0xab, 0x2d, 0xe0, 0xaa, 0x67, 0x3e, 0xf3, 0xb9, 0x74, 0xb, 0xc6, 0x8c, 0x41, 0x18, 0xd5, 0x9f, 0x52, 0x61, 0xac, 0xe6, 0x2b, 0x72, 0xbf, 0xf5, 0x38, 0x47, 0x8a, 0xc0, 0xd, 0x54, 0x99, 0xd3, 0x1e, 0xb5, 0x78, 0x32, 0xff, 0xa6, 0x6b, 0x21, 0xec, 0x93, 0x5e, 0x14, 0xd9, 0x80, 0x4d, 0x7, 0xca, 0xf9, 0x34, 0x7e, 0xb3, 0xea, 0x27, 0x6d, 0xa0, 0xdf, 0x12, 0x58, 0x95, 0xcc, 0x1, 0x4b, 0x86, 0x5a, 0x97, 0xdd, 0x10, 0x49, 0x84, 0xce, 0x3, 0x7c, 0xb1, 0xfb, 0x36, 0x6f, 0xa2, 0xe8, 0x25, 0x16, 0xdb, 0x91, 0x5c, 0x5, 0xc8, 0x82, 0x4f, 0x30, 0xfd, 0xb7, 0x7a, 0x23, 0xee, 0xa4, 0x69, 0xc2, 0xf, 0x45, 0x88, 0xd1, 0x1c, 0x56, 0x9b, 0xe4, 0x29, 0x63, 0xae, 0xf7, 0x3a, 0x70, 0xbd, 0x8e, 0x43, 0x9, 0xc4, 0x9d, 0x50, 0x1a, 0xd7, 0xa8, 0x65, 0x2f, 0xe2, 0xbb, 0x76, 0x3c, 0xf1, 0x77, 0xba, 0xf0, 0x3d, 0x64, 0xa9, 0xe3, 0x2e, 0x51, 0x9c, 0xd6, 0x1b, 0x42, 0x8f, 0xc5, 0x8, 0x3b, 0xf6, 0xbc, 0x71, 0x28, 0xe5, 0xaf, 0x62, 0x1d, 0xd0, 0x9a, 0x57, 0xe, 0xc3, 0x89, 0x44, 0xef, 0x22, 0x68, 0xa5, 0xfc, 0x31, 0x7b, 0xb6, 0xc9, 0x4, 0x4e, 0x83, 0xda, 0x17, 0x5d, 0x90, 0xa3, 0x6e, 0x24, 0xe9, 0xb0, 0x7d, 0x37, 0xfa, 0x85, 0x48, 0x2, 0xcf, 0x96, 0x5b, 0x11, 0xdc},
+ {0x0, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e, 0x7c, 0xb2, 0xfd, 0x33, 0x63, 0xad, 0xe2, 0x2c, 0x42, 0x8c, 0xc3, 0xd, 0x5d, 0x93, 0xdc, 0x12, 0xf8, 0x36, 0x79, 0xb7, 0xe7, 0x29, 0x66, 0xa8, 0xc6, 0x8, 0x47, 0x89, 0xd9, 0x17, 0x58, 0x96, 0x84, 0x4a, 0x5, 0xcb, 0x9b, 0x55, 0x1a, 0xd4, 0xba, 0x74, 0x3b, 0xf5, 0xa5, 0x6b, 0x24, 0xea, 0xed, 0x23, 0x6c, 0xa2, 0xf2, 0x3c, 0x73, 0xbd, 0xd3, 0x1d, 0x52, 0x9c, 0xcc, 0x2, 0x4d, 0x83, 0x91, 0x5f, 0x10, 0xde, 0x8e, 0x40, 0xf, 0xc1, 0xaf, 0x61, 0x2e, 0xe0, 0xb0, 0x7e, 0x31, 0xff, 0x15, 0xdb, 0x94, 0x5a, 0xa, 0xc4, 0x8b, 0x45, 0x2b, 0xe5, 0xaa, 0x64, 0x34, 0xfa, 0xb5, 0x7b, 0x69, 0xa7, 0xe8, 0x26, 0x76, 0xb8, 0xf7, 0x39, 0x57, 0x99, 0xd6, 0x18, 0x48, 0x86, 0xc9, 0x7, 0xc7, 0x9, 0x46, 0x88, 0xd8, 0x16, 0x59, 0x97, 0xf9, 0x37, 0x78, 0xb6, 0xe6, 0x28, 0x67, 0xa9, 0xbb, 0x75, 0x3a, 0xf4, 0xa4, 0x6a, 0x25, 0xeb, 0x85, 0x4b, 0x4, 0xca, 0x9a, 0x54, 0x1b, 0xd5, 0x3f, 0xf1, 0xbe, 0x70, 0x20, 0xee, 0xa1, 0x6f, 0x1, 0xcf, 0x80, 0x4e, 0x1e, 0xd0, 0x9f, 0x51, 0x43, 0x8d, 0xc2, 0xc, 0x5c, 0x92, 0xdd, 0x13, 0x7d, 0xb3, 0xfc, 0x32, 0x62, 0xac, 0xe3, 0x2d, 0x2a, 0xe4, 0xab, 0x65, 0x35, 0xfb, 0xb4, 0x7a, 0x14, 0xda, 0x95, 0x5b, 0xb, 0xc5, 0x8a, 0x44, 0x56, 0x98, 0xd7, 0x19, 0x49, 0x87, 0xc8, 0x6, 0x68, 0xa6, 0xe9, 0x27, 0x77, 0xb9, 0xf6, 0x38, 0xd2, 0x1c, 0x53, 0x9d, 0xcd, 0x3, 0x4c, 0x82, 0xec, 0x22, 0x6d, 0xa3, 0xf3, 0x3d, 0x72, 0xbc, 0xae, 0x60, 0x2f, 0xe1, 0xb1, 0x7f, 0x30, 0xfe, 0x90, 0x5e, 0x11, 0xdf, 0x8f, 0x41, 0xe, 0xc0},
+ {0x0, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61, 0x6c, 0xa3, 0xef, 0x20, 0x77, 0xb8, 0xf4, 0x3b, 0x5a, 0x95, 0xd9, 0x16, 0x41, 0x8e, 0xc2, 0xd, 0xd8, 0x17, 0x5b, 0x94, 0xc3, 0xc, 0x40, 0x8f, 0xee, 0x21, 0x6d, 0xa2, 0xf5, 0x3a, 0x76, 0xb9, 0xb4, 0x7b, 0x37, 0xf8, 0xaf, 0x60, 0x2c, 0xe3, 0x82, 0x4d, 0x1, 0xce, 0x99, 0x56, 0x1a, 0xd5, 0xad, 0x62, 0x2e, 0xe1, 0xb6, 0x79, 0x35, 0xfa, 0x9b, 0x54, 0x18, 0xd7, 0x80, 0x4f, 0x3, 0xcc, 0xc1, 0xe, 0x42, 0x8d, 0xda, 0x15, 0x59, 0x96, 0xf7, 0x38, 0x74, 0xbb, 0xec, 0x23, 0x6f, 0xa0, 0x75, 0xba, 0xf6, 0x39, 0x6e, 0xa1, 0xed, 0x22, 0x43, 0x8c, 0xc0, 0xf, 0x58, 0x97, 0xdb, 0x14, 0x19, 0xd6, 0x9a, 0x55, 0x2, 0xcd, 0x81, 0x4e, 0x2f, 0xe0, 0xac, 0x63, 0x34, 0xfb, 0xb7, 0x78, 0x47, 0x88, 0xc4, 0xb, 0x5c, 0x93, 0xdf, 0x10, 0x71, 0xbe, 0xf2, 0x3d, 0x6a, 0xa5, 0xe9, 0x26, 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c, 0x1d, 0xd2, 0x9e, 0x51, 0x6, 0xc9, 0x85, 0x4a, 0x9f, 0x50, 0x1c, 0xd3, 0x84, 0x4b, 0x7, 0xc8, 0xa9, 0x66, 0x2a, 0xe5, 0xb2, 0x7d, 0x31, 0xfe, 0xf3, 0x3c, 0x70, 0xbf, 0xe8, 0x27, 0x6b, 0xa4, 0xc5, 0xa, 0x46, 0x89, 0xde, 0x11, 0x5d, 0x92, 0xea, 0x25, 0x69, 0xa6, 0xf1, 0x3e, 0x72, 0xbd, 0xdc, 0x13, 0x5f, 0x90, 0xc7, 0x8, 0x44, 0x8b, 0x86, 0x49, 0x5, 0xca, 0x9d, 0x52, 0x1e, 0xd1, 0xb0, 0x7f, 0x33, 0xfc, 0xab, 0x64, 0x28, 0xe7, 0x32, 0xfd, 0xb1, 0x7e, 0x29, 0xe6, 0xaa, 0x65, 0x4, 0xcb, 0x87, 0x48, 0x1f, 0xd0, 0x9c, 0x53, 0x5e, 0x91, 0xdd, 0x12, 0x45, 0x8a, 0xc6, 0x9, 0x68, 0xa7, 0xeb, 0x24, 0x73, 0xbc, 0xf0, 0x3f},
+ {0x0, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0xa, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4, 0x81, 0x51, 0x3c, 0xec, 0xe6, 0x36, 0x5b, 0x8b, 0x4f, 0x9f, 0xf2, 0x22, 0x28, 0xf8, 0x95, 0x45, 0x1f, 0xcf, 0xa2, 0x72, 0x78, 0xa8, 0xc5, 0x15, 0xd1, 0x1, 0x6c, 0xbc, 0xb6, 0x66, 0xb, 0xdb, 0x9e, 0x4e, 0x23, 0xf3, 0xf9, 0x29, 0x44, 0x94, 0x50, 0x80, 0xed, 0x3d, 0x37, 0xe7, 0x8a, 0x5a, 0x3e, 0xee, 0x83, 0x53, 0x59, 0x89, 0xe4, 0x34, 0xf0, 0x20, 0x4d, 0x9d, 0x97, 0x47, 0x2a, 0xfa, 0xbf, 0x6f, 0x2, 0xd2, 0xd8, 0x8, 0x65, 0xb5, 0x71, 0xa1, 0xcc, 0x1c, 0x16, 0xc6, 0xab, 0x7b, 0x21, 0xf1, 0x9c, 0x4c, 0x46, 0x96, 0xfb, 0x2b, 0xef, 0x3f, 0x52, 0x82, 0x88, 0x58, 0x35, 0xe5, 0xa0, 0x70, 0x1d, 0xcd, 0xc7, 0x17, 0x7a, 0xaa, 0x6e, 0xbe, 0xd3, 0x3, 0x9, 0xd9, 0xb4, 0x64, 0x7c, 0xac, 0xc1, 0x11, 0x1b, 0xcb, 0xa6, 0x76, 0xb2, 0x62, 0xf, 0xdf, 0xd5, 0x5, 0x68, 0xb8, 0xfd, 0x2d, 0x40, 0x90, 0x9a, 0x4a, 0x27, 0xf7, 0x33, 0xe3, 0x8e, 0x5e, 0x54, 0x84, 0xe9, 0x39, 0x63, 0xb3, 0xde, 0xe, 0x4, 0xd4, 0xb9, 0x69, 0xad, 0x7d, 0x10, 0xc0, 0xca, 0x1a, 0x77, 0xa7, 0xe2, 0x32, 0x5f, 0x8f, 0x85, 0x55, 0x38, 0xe8, 0x2c, 0xfc, 0x91, 0x41, 0x4b, 0x9b, 0xf6, 0x26, 0x42, 0x92, 0xff, 0x2f, 0x25, 0xf5, 0x98, 0x48, 0x8c, 0x5c, 0x31, 0xe1, 0xeb, 0x3b, 0x56, 0x86, 0xc3, 0x13, 0x7e, 0xae, 0xa4, 0x74, 0x19, 0xc9, 0xd, 0xdd, 0xb0, 0x60, 0x6a, 0xba, 0xd7, 0x7, 0x5d, 0x8d, 0xe0, 0x30, 0x3a, 0xea, 0x87, 0x57, 0x93, 0x43, 0x2e, 0xfe, 0xf4, 0x24, 0x49, 0x99, 0xdc, 0xc, 0x61, 0xb1, 0xbb, 0x6b, 0x6, 0xd6, 0x12, 0xc2, 0xaf, 0x7f, 0x75, 0xa5, 0xc8, 0x18},
+ {0x0, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0xd, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb, 0x91, 0x40, 0x2e, 0xff, 0xf2, 0x23, 0x4d, 0x9c, 0x57, 0x86, 0xe8, 0x39, 0x34, 0xe5, 0x8b, 0x5a, 0x3f, 0xee, 0x80, 0x51, 0x5c, 0x8d, 0xe3, 0x32, 0xf9, 0x28, 0x46, 0x97, 0x9a, 0x4b, 0x25, 0xf4, 0xae, 0x7f, 0x11, 0xc0, 0xcd, 0x1c, 0x72, 0xa3, 0x68, 0xb9, 0xd7, 0x6, 0xb, 0xda, 0xb4, 0x65, 0x7e, 0xaf, 0xc1, 0x10, 0x1d, 0xcc, 0xa2, 0x73, 0xb8, 0x69, 0x7, 0xd6, 0xdb, 0xa, 0x64, 0xb5, 0xef, 0x3e, 0x50, 0x81, 0x8c, 0x5d, 0x33, 0xe2, 0x29, 0xf8, 0x96, 0x47, 0x4a, 0x9b, 0xf5, 0x24, 0x41, 0x90, 0xfe, 0x2f, 0x22, 0xf3, 0x9d, 0x4c, 0x87, 0x56, 0x38, 0xe9, 0xe4, 0x35, 0x5b, 0x8a, 0xd0, 0x1, 0x6f, 0xbe, 0xb3, 0x62, 0xc, 0xdd, 0x16, 0xc7, 0xa9, 0x78, 0x75, 0xa4, 0xca, 0x1b, 0xfc, 0x2d, 0x43, 0x92, 0x9f, 0x4e, 0x20, 0xf1, 0x3a, 0xeb, 0x85, 0x54, 0x59, 0x88, 0xe6, 0x37, 0x6d, 0xbc, 0xd2, 0x3, 0xe, 0xdf, 0xb1, 0x60, 0xab, 0x7a, 0x14, 0xc5, 0xc8, 0x19, 0x77, 0xa6, 0xc3, 0x12, 0x7c, 0xad, 0xa0, 0x71, 0x1f, 0xce, 0x5, 0xd4, 0xba, 0x6b, 0x66, 0xb7, 0xd9, 0x8, 0x52, 0x83, 0xed, 0x3c, 0x31, 0xe0, 0x8e, 0x5f, 0x94, 0x45, 0x2b, 0xfa, 0xf7, 0x26, 0x48, 0x99, 0x82, 0x53, 0x3d, 0xec, 0xe1, 0x30, 0x5e, 0x8f, 0x44, 0x95, 0xfb, 0x2a, 0x27, 0xf6, 0x98, 0x49, 0x13, 0xc2, 0xac, 0x7d, 0x70, 0xa1, 0xcf, 0x1e, 0xd5, 0x4, 0x6a, 0xbb, 0xb6, 0x67, 0x9, 0xd8, 0xbd, 0x6c, 0x2, 0xd3, 0xde, 0xf, 0x61, 0xb0, 0x7b, 0xaa, 0xc4, 0x15, 0x18, 0xc9, 0xa7, 0x76, 0x2c, 0xfd, 0x93, 0x42, 0x4f, 0x9e, 0xf0, 0x21, 0xea, 0x3b, 0x55, 0x84, 0x89, 0x58, 0x36, 0xe7},
+ {0x0, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x4, 0xde, 0xc, 0x67, 0xb5, 0xb1, 0x63, 0x8, 0xda, 0xa1, 0x73, 0x18, 0xca, 0xce, 0x1c, 0x77, 0xa5, 0x7f, 0xad, 0xc6, 0x14, 0x10, 0xc2, 0xa9, 0x7b, 0x5f, 0x8d, 0xe6, 0x34, 0x30, 0xe2, 0x89, 0x5b, 0x81, 0x53, 0x38, 0xea, 0xee, 0x3c, 0x57, 0x85, 0xfe, 0x2c, 0x47, 0x95, 0x91, 0x43, 0x28, 0xfa, 0x20, 0xf2, 0x99, 0x4b, 0x4f, 0x9d, 0xf6, 0x24, 0xbe, 0x6c, 0x7, 0xd5, 0xd1, 0x3, 0x68, 0xba, 0x60, 0xb2, 0xd9, 0xb, 0xf, 0xdd, 0xb6, 0x64, 0x1f, 0xcd, 0xa6, 0x74, 0x70, 0xa2, 0xc9, 0x1b, 0xc1, 0x13, 0x78, 0xaa, 0xae, 0x7c, 0x17, 0xc5, 0xe1, 0x33, 0x58, 0x8a, 0x8e, 0x5c, 0x37, 0xe5, 0x3f, 0xed, 0x86, 0x54, 0x50, 0x82, 0xe9, 0x3b, 0x40, 0x92, 0xf9, 0x2b, 0x2f, 0xfd, 0x96, 0x44, 0x9e, 0x4c, 0x27, 0xf5, 0xf1, 0x23, 0x48, 0x9a, 0x61, 0xb3, 0xd8, 0xa, 0xe, 0xdc, 0xb7, 0x65, 0xbf, 0x6d, 0x6, 0xd4, 0xd0, 0x2, 0x69, 0xbb, 0xc0, 0x12, 0x79, 0xab, 0xaf, 0x7d, 0x16, 0xc4, 0x1e, 0xcc, 0xa7, 0x75, 0x71, 0xa3, 0xc8, 0x1a, 0x3e, 0xec, 0x87, 0x55, 0x51, 0x83, 0xe8, 0x3a, 0xe0, 0x32, 0x59, 0x8b, 0x8f, 0x5d, 0x36, 0xe4, 0x9f, 0x4d, 0x26, 0xf4, 0xf0, 0x22, 0x49, 0x9b, 0x41, 0x93, 0xf8, 0x2a, 0x2e, 0xfc, 0x97, 0x45, 0xdf, 0xd, 0x66, 0xb4, 0xb0, 0x62, 0x9, 0xdb, 0x1, 0xd3, 0xb8, 0x6a, 0x6e, 0xbc, 0xd7, 0x5, 0x7e, 0xac, 0xc7, 0x15, 0x11, 0xc3, 0xa8, 0x7a, 0xa0, 0x72, 0x19, 0xcb, 0xcf, 0x1d, 0x76, 0xa4, 0x80, 0x52, 0x39, 0xeb, 0xef, 0x3d, 0x56, 0x84, 0x5e, 0x8c, 0xe7, 0x35, 0x31, 0xe3, 0x88, 0x5a, 0x21, 0xf3, 0x98, 0x4a, 0x4e, 0x9c, 0xf7, 0x25, 0xff, 0x2d, 0x46, 0x94, 0x90, 0x42, 0x29, 0xfb},
+ {0x0, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x3, 0xd6, 0x5, 0x6d, 0xbe, 0xbd, 0x6e, 0x6, 0xd5, 0xb1, 0x62, 0xa, 0xd9, 0xda, 0x9, 0x61, 0xb2, 0x67, 0xb4, 0xdc, 0xf, 0xc, 0xdf, 0xb7, 0x64, 0x7f, 0xac, 0xc4, 0x17, 0x14, 0xc7, 0xaf, 0x7c, 0xa9, 0x7a, 0x12, 0xc1, 0xc2, 0x11, 0x79, 0xaa, 0xce, 0x1d, 0x75, 0xa6, 0xa5, 0x76, 0x1e, 0xcd, 0x18, 0xcb, 0xa3, 0x70, 0x73, 0xa0, 0xc8, 0x1b, 0xfe, 0x2d, 0x45, 0x96, 0x95, 0x46, 0x2e, 0xfd, 0x28, 0xfb, 0x93, 0x40, 0x43, 0x90, 0xf8, 0x2b, 0x4f, 0x9c, 0xf4, 0x27, 0x24, 0xf7, 0x9f, 0x4c, 0x99, 0x4a, 0x22, 0xf1, 0xf2, 0x21, 0x49, 0x9a, 0x81, 0x52, 0x3a, 0xe9, 0xea, 0x39, 0x51, 0x82, 0x57, 0x84, 0xec, 0x3f, 0x3c, 0xef, 0x87, 0x54, 0x30, 0xe3, 0x8b, 0x58, 0x5b, 0x88, 0xe0, 0x33, 0xe6, 0x35, 0x5d, 0x8e, 0x8d, 0x5e, 0x36, 0xe5, 0xe1, 0x32, 0x5a, 0x89, 0x8a, 0x59, 0x31, 0xe2, 0x37, 0xe4, 0x8c, 0x5f, 0x5c, 0x8f, 0xe7, 0x34, 0x50, 0x83, 0xeb, 0x38, 0x3b, 0xe8, 0x80, 0x53, 0x86, 0x55, 0x3d, 0xee, 0xed, 0x3e, 0x56, 0x85, 0x9e, 0x4d, 0x25, 0xf6, 0xf5, 0x26, 0x4e, 0x9d, 0x48, 0x9b, 0xf3, 0x20, 0x23, 0xf0, 0x98, 0x4b, 0x2f, 0xfc, 0x94, 0x47, 0x44, 0x97, 0xff, 0x2c, 0xf9, 0x2a, 0x42, 0x91, 0x92, 0x41, 0x29, 0xfa, 0x1f, 0xcc, 0xa4, 0x77, 0x74, 0xa7, 0xcf, 0x1c, 0xc9, 0x1a, 0x72, 0xa1, 0xa2, 0x71, 0x19, 0xca, 0xae, 0x7d, 0x15, 0xc6, 0xc5, 0x16, 0x7e, 0xad, 0x78, 0xab, 0xc3, 0x10, 0x13, 0xc0, 0xa8, 0x7b, 0x60, 0xb3, 0xdb, 0x8, 0xb, 0xd8, 0xb0, 0x63, 0xb6, 0x65, 0xd, 0xde, 0xdd, 0xe, 0x66, 0xb5, 0xd1, 0x2, 0x6a, 0xb9, 0xba, 0x69, 0x1, 0xd2, 0x7, 0xd4, 0xbc, 0x6f, 0x6c, 0xbf, 0xd7, 0x4},
+ {0x0, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8, 0xc1, 0x15, 0x74, 0xa0, 0xb6, 0x62, 0x3, 0xd7, 0x2f, 0xfb, 0x9a, 0x4e, 0x58, 0x8c, 0xed, 0x39, 0x9f, 0x4b, 0x2a, 0xfe, 0xe8, 0x3c, 0x5d, 0x89, 0x71, 0xa5, 0xc4, 0x10, 0x6, 0xd2, 0xb3, 0x67, 0x5e, 0x8a, 0xeb, 0x3f, 0x29, 0xfd, 0x9c, 0x48, 0xb0, 0x64, 0x5, 0xd1, 0xc7, 0x13, 0x72, 0xa6, 0x23, 0xf7, 0x96, 0x42, 0x54, 0x80, 0xe1, 0x35, 0xcd, 0x19, 0x78, 0xac, 0xba, 0x6e, 0xf, 0xdb, 0xe2, 0x36, 0x57, 0x83, 0x95, 0x41, 0x20, 0xf4, 0xc, 0xd8, 0xb9, 0x6d, 0x7b, 0xaf, 0xce, 0x1a, 0xbc, 0x68, 0x9, 0xdd, 0xcb, 0x1f, 0x7e, 0xaa, 0x52, 0x86, 0xe7, 0x33, 0x25, 0xf1, 0x90, 0x44, 0x7d, 0xa9, 0xc8, 0x1c, 0xa, 0xde, 0xbf, 0x6b, 0x93, 0x47, 0x26, 0xf2, 0xe4, 0x30, 0x51, 0x85, 0x46, 0x92, 0xf3, 0x27, 0x31, 0xe5, 0x84, 0x50, 0xa8, 0x7c, 0x1d, 0xc9, 0xdf, 0xb, 0x6a, 0xbe, 0x87, 0x53, 0x32, 0xe6, 0xf0, 0x24, 0x45, 0x91, 0x69, 0xbd, 0xdc, 0x8, 0x1e, 0xca, 0xab, 0x7f, 0xd9, 0xd, 0x6c, 0xb8, 0xae, 0x7a, 0x1b, 0xcf, 0x37, 0xe3, 0x82, 0x56, 0x40, 0x94, 0xf5, 0x21, 0x18, 0xcc, 0xad, 0x79, 0x6f, 0xbb, 0xda, 0xe, 0xf6, 0x22, 0x43, 0x97, 0x81, 0x55, 0x34, 0xe0, 0x65, 0xb1, 0xd0, 0x4, 0x12, 0xc6, 0xa7, 0x73, 0x8b, 0x5f, 0x3e, 0xea, 0xfc, 0x28, 0x49, 0x9d, 0xa4, 0x70, 0x11, 0xc5, 0xd3, 0x7, 0x66, 0xb2, 0x4a, 0x9e, 0xff, 0x2b, 0x3d, 0xe9, 0x88, 0x5c, 0xfa, 0x2e, 0x4f, 0x9b, 0x8d, 0x59, 0x38, 0xec, 0x14, 0xc0, 0xa1, 0x75, 0x63, 0xb7, 0xd6, 0x2, 0x3b, 0xef, 0x8e, 0x5a, 0x4c, 0x98, 0xf9, 0x2d, 0xd5, 0x1, 0x60, 0xb4, 0xa2, 0x76, 0x17, 0xc3},
+ {0x0, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7, 0xd1, 0x4, 0x66, 0xb3, 0xa2, 0x77, 0x15, 0xc0, 0x37, 0xe2, 0x80, 0x55, 0x44, 0x91, 0xf3, 0x26, 0xbf, 0x6a, 0x8, 0xdd, 0xcc, 0x19, 0x7b, 0xae, 0x59, 0x8c, 0xee, 0x3b, 0x2a, 0xff, 0x9d, 0x48, 0x6e, 0xbb, 0xd9, 0xc, 0x1d, 0xc8, 0xaa, 0x7f, 0x88, 0x5d, 0x3f, 0xea, 0xfb, 0x2e, 0x4c, 0x99, 0x63, 0xb6, 0xd4, 0x1, 0x10, 0xc5, 0xa7, 0x72, 0x85, 0x50, 0x32, 0xe7, 0xf6, 0x23, 0x41, 0x94, 0xb2, 0x67, 0x5, 0xd0, 0xc1, 0x14, 0x76, 0xa3, 0x54, 0x81, 0xe3, 0x36, 0x27, 0xf2, 0x90, 0x45, 0xdc, 0x9, 0x6b, 0xbe, 0xaf, 0x7a, 0x18, 0xcd, 0x3a, 0xef, 0x8d, 0x58, 0x49, 0x9c, 0xfe, 0x2b, 0xd, 0xd8, 0xba, 0x6f, 0x7e, 0xab, 0xc9, 0x1c, 0xeb, 0x3e, 0x5c, 0x89, 0x98, 0x4d, 0x2f, 0xfa, 0xc6, 0x13, 0x71, 0xa4, 0xb5, 0x60, 0x2, 0xd7, 0x20, 0xf5, 0x97, 0x42, 0x53, 0x86, 0xe4, 0x31, 0x17, 0xc2, 0xa0, 0x75, 0x64, 0xb1, 0xd3, 0x6, 0xf1, 0x24, 0x46, 0x93, 0x82, 0x57, 0x35, 0xe0, 0x79, 0xac, 0xce, 0x1b, 0xa, 0xdf, 0xbd, 0x68, 0x9f, 0x4a, 0x28, 0xfd, 0xec, 0x39, 0x5b, 0x8e, 0xa8, 0x7d, 0x1f, 0xca, 0xdb, 0xe, 0x6c, 0xb9, 0x4e, 0x9b, 0xf9, 0x2c, 0x3d, 0xe8, 0x8a, 0x5f, 0xa5, 0x70, 0x12, 0xc7, 0xd6, 0x3, 0x61, 0xb4, 0x43, 0x96, 0xf4, 0x21, 0x30, 0xe5, 0x87, 0x52, 0x74, 0xa1, 0xc3, 0x16, 0x7, 0xd2, 0xb0, 0x65, 0x92, 0x47, 0x25, 0xf0, 0xe1, 0x34, 0x56, 0x83, 0x1a, 0xcf, 0xad, 0x78, 0x69, 0xbc, 0xde, 0xb, 0xfc, 0x29, 0x4b, 0x9e, 0x8f, 0x5a, 0x38, 0xed, 0xcb, 0x1e, 0x7c, 0xa9, 0xb8, 0x6d, 0xf, 0xda, 0x2d, 0xf8, 0x9a, 0x4f, 0x5e, 0x8b, 0xe9, 0x3c},
+ {0x0, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6, 0xe1, 0x37, 0x50, 0x86, 0x9e, 0x48, 0x2f, 0xf9, 0x1f, 0xc9, 0xae, 0x78, 0x60, 0xb6, 0xd1, 0x7, 0xdf, 0x9, 0x6e, 0xb8, 0xa0, 0x76, 0x11, 0xc7, 0x21, 0xf7, 0x90, 0x46, 0x5e, 0x88, 0xef, 0x39, 0x3e, 0xe8, 0x8f, 0x59, 0x41, 0x97, 0xf0, 0x26, 0xc0, 0x16, 0x71, 0xa7, 0xbf, 0x69, 0xe, 0xd8, 0xa3, 0x75, 0x12, 0xc4, 0xdc, 0xa, 0x6d, 0xbb, 0x5d, 0x8b, 0xec, 0x3a, 0x22, 0xf4, 0x93, 0x45, 0x42, 0x94, 0xf3, 0x25, 0x3d, 0xeb, 0x8c, 0x5a, 0xbc, 0x6a, 0xd, 0xdb, 0xc3, 0x15, 0x72, 0xa4, 0x7c, 0xaa, 0xcd, 0x1b, 0x3, 0xd5, 0xb2, 0x64, 0x82, 0x54, 0x33, 0xe5, 0xfd, 0x2b, 0x4c, 0x9a, 0x9d, 0x4b, 0x2c, 0xfa, 0xe2, 0x34, 0x53, 0x85, 0x63, 0xb5, 0xd2, 0x4, 0x1c, 0xca, 0xad, 0x7b, 0x5b, 0x8d, 0xea, 0x3c, 0x24, 0xf2, 0x95, 0x43, 0xa5, 0x73, 0x14, 0xc2, 0xda, 0xc, 0x6b, 0xbd, 0xba, 0x6c, 0xb, 0xdd, 0xc5, 0x13, 0x74, 0xa2, 0x44, 0x92, 0xf5, 0x23, 0x3b, 0xed, 0x8a, 0x5c, 0x84, 0x52, 0x35, 0xe3, 0xfb, 0x2d, 0x4a, 0x9c, 0x7a, 0xac, 0xcb, 0x1d, 0x5, 0xd3, 0xb4, 0x62, 0x65, 0xb3, 0xd4, 0x2, 0x1a, 0xcc, 0xab, 0x7d, 0x9b, 0x4d, 0x2a, 0xfc, 0xe4, 0x32, 0x55, 0x83, 0xf8, 0x2e, 0x49, 0x9f, 0x87, 0x51, 0x36, 0xe0, 0x6, 0xd0, 0xb7, 0x61, 0x79, 0xaf, 0xc8, 0x1e, 0x19, 0xcf, 0xa8, 0x7e, 0x66, 0xb0, 0xd7, 0x1, 0xe7, 0x31, 0x56, 0x80, 0x98, 0x4e, 0x29, 0xff, 0x27, 0xf1, 0x96, 0x40, 0x58, 0x8e, 0xe9, 0x3f, 0xd9, 0xf, 0x68, 0xbe, 0xa6, 0x70, 0x17, 0xc1, 0xc6, 0x10, 0x77, 0xa1, 0xb9, 0x6f, 0x8, 0xde, 0x38, 0xee, 0x89, 0x5f, 0x47, 0x91, 0xf6, 0x20},
+ {0x0, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9, 0xf1, 0x26, 0x42, 0x95, 0x8a, 0x5d, 0x39, 0xee, 0x7, 0xd0, 0xb4, 0x63, 0x7c, 0xab, 0xcf, 0x18, 0xff, 0x28, 0x4c, 0x9b, 0x84, 0x53, 0x37, 0xe0, 0x9, 0xde, 0xba, 0x6d, 0x72, 0xa5, 0xc1, 0x16, 0xe, 0xd9, 0xbd, 0x6a, 0x75, 0xa2, 0xc6, 0x11, 0xf8, 0x2f, 0x4b, 0x9c, 0x83, 0x54, 0x30, 0xe7, 0xe3, 0x34, 0x50, 0x87, 0x98, 0x4f, 0x2b, 0xfc, 0x15, 0xc2, 0xa6, 0x71, 0x6e, 0xb9, 0xdd, 0xa, 0x12, 0xc5, 0xa1, 0x76, 0x69, 0xbe, 0xda, 0xd, 0xe4, 0x33, 0x57, 0x80, 0x9f, 0x48, 0x2c, 0xfb, 0x1c, 0xcb, 0xaf, 0x78, 0x67, 0xb0, 0xd4, 0x3, 0xea, 0x3d, 0x59, 0x8e, 0x91, 0x46, 0x22, 0xf5, 0xed, 0x3a, 0x5e, 0x89, 0x96, 0x41, 0x25, 0xf2, 0x1b, 0xcc, 0xa8, 0x7f, 0x60, 0xb7, 0xd3, 0x4, 0xdb, 0xc, 0x68, 0xbf, 0xa0, 0x77, 0x13, 0xc4, 0x2d, 0xfa, 0x9e, 0x49, 0x56, 0x81, 0xe5, 0x32, 0x2a, 0xfd, 0x99, 0x4e, 0x51, 0x86, 0xe2, 0x35, 0xdc, 0xb, 0x6f, 0xb8, 0xa7, 0x70, 0x14, 0xc3, 0x24, 0xf3, 0x97, 0x40, 0x5f, 0x88, 0xec, 0x3b, 0xd2, 0x5, 0x61, 0xb6, 0xa9, 0x7e, 0x1a, 0xcd, 0xd5, 0x2, 0x66, 0xb1, 0xae, 0x79, 0x1d, 0xca, 0x23, 0xf4, 0x90, 0x47, 0x58, 0x8f, 0xeb, 0x3c, 0x38, 0xef, 0x8b, 0x5c, 0x43, 0x94, 0xf0, 0x27, 0xce, 0x19, 0x7d, 0xaa, 0xb5, 0x62, 0x6, 0xd1, 0xc9, 0x1e, 0x7a, 0xad, 0xb2, 0x65, 0x1, 0xd6, 0x3f, 0xe8, 0x8c, 0x5b, 0x44, 0x93, 0xf7, 0x20, 0xc7, 0x10, 0x74, 0xa3, 0xbc, 0x6b, 0xf, 0xd8, 0x31, 0xe6, 0x82, 0x55, 0x4a, 0x9d, 0xf9, 0x2e, 0x36, 0xe1, 0x85, 0x52, 0x4d, 0x9a, 0xfe, 0x29, 0xc0, 0x17, 0x73, 0xa4, 0xbb, 0x6c, 0x8, 0xdf},
+ {0x0, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc, 0x1, 0xd9, 0xac, 0x74, 0x46, 0x9e, 0xeb, 0x33, 0x8f, 0x57, 0x22, 0xfa, 0xc8, 0x10, 0x65, 0xbd, 0x2, 0xda, 0xaf, 0x77, 0x45, 0x9d, 0xe8, 0x30, 0x8c, 0x54, 0x21, 0xf9, 0xcb, 0x13, 0x66, 0xbe, 0x3, 0xdb, 0xae, 0x76, 0x44, 0x9c, 0xe9, 0x31, 0x8d, 0x55, 0x20, 0xf8, 0xca, 0x12, 0x67, 0xbf, 0x4, 0xdc, 0xa9, 0x71, 0x43, 0x9b, 0xee, 0x36, 0x8a, 0x52, 0x27, 0xff, 0xcd, 0x15, 0x60, 0xb8, 0x5, 0xdd, 0xa8, 0x70, 0x42, 0x9a, 0xef, 0x37, 0x8b, 0x53, 0x26, 0xfe, 0xcc, 0x14, 0x61, 0xb9, 0x6, 0xde, 0xab, 0x73, 0x41, 0x99, 0xec, 0x34, 0x88, 0x50, 0x25, 0xfd, 0xcf, 0x17, 0x62, 0xba, 0x7, 0xdf, 0xaa, 0x72, 0x40, 0x98, 0xed, 0x35, 0x89, 0x51, 0x24, 0xfc, 0xce, 0x16, 0x63, 0xbb, 0x8, 0xd0, 0xa5, 0x7d, 0x4f, 0x97, 0xe2, 0x3a, 0x86, 0x5e, 0x2b, 0xf3, 0xc1, 0x19, 0x6c, 0xb4, 0x9, 0xd1, 0xa4, 0x7c, 0x4e, 0x96, 0xe3, 0x3b, 0x87, 0x5f, 0x2a, 0xf2, 0xc0, 0x18, 0x6d, 0xb5, 0xa, 0xd2, 0xa7, 0x7f, 0x4d, 0x95, 0xe0, 0x38, 0x84, 0x5c, 0x29, 0xf1, 0xc3, 0x1b, 0x6e, 0xb6, 0xb, 0xd3, 0xa6, 0x7e, 0x4c, 0x94, 0xe1, 0x39, 0x85, 0x5d, 0x28, 0xf0, 0xc2, 0x1a, 0x6f, 0xb7, 0xc, 0xd4, 0xa1, 0x79, 0x4b, 0x93, 0xe6, 0x3e, 0x82, 0x5a, 0x2f, 0xf7, 0xc5, 0x1d, 0x68, 0xb0, 0xd, 0xd5, 0xa0, 0x78, 0x4a, 0x92, 0xe7, 0x3f, 0x83, 0x5b, 0x2e, 0xf6, 0xc4, 0x1c, 0x69, 0xb1, 0xe, 0xd6, 0xa3, 0x7b, 0x49, 0x91, 0xe4, 0x3c, 0x80, 0x58, 0x2d, 0xf5, 0xc7, 0x1f, 0x6a, 0xb2, 0xf, 0xd7, 0xa2, 0x7a, 0x48, 0x90, 0xe5, 0x3d, 0x81, 0x59, 0x2c, 0xf4, 0xc6, 0x1e, 0x6b, 0xb3},
+ {0x0, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3, 0x11, 0xc8, 0xbe, 0x67, 0x52, 0x8b, 0xfd, 0x24, 0x97, 0x4e, 0x38, 0xe1, 0xd4, 0xd, 0x7b, 0xa2, 0x22, 0xfb, 0x8d, 0x54, 0x61, 0xb8, 0xce, 0x17, 0xa4, 0x7d, 0xb, 0xd2, 0xe7, 0x3e, 0x48, 0x91, 0x33, 0xea, 0x9c, 0x45, 0x70, 0xa9, 0xdf, 0x6, 0xb5, 0x6c, 0x1a, 0xc3, 0xf6, 0x2f, 0x59, 0x80, 0x44, 0x9d, 0xeb, 0x32, 0x7, 0xde, 0xa8, 0x71, 0xc2, 0x1b, 0x6d, 0xb4, 0x81, 0x58, 0x2e, 0xf7, 0x55, 0x8c, 0xfa, 0x23, 0x16, 0xcf, 0xb9, 0x60, 0xd3, 0xa, 0x7c, 0xa5, 0x90, 0x49, 0x3f, 0xe6, 0x66, 0xbf, 0xc9, 0x10, 0x25, 0xfc, 0x8a, 0x53, 0xe0, 0x39, 0x4f, 0x96, 0xa3, 0x7a, 0xc, 0xd5, 0x77, 0xae, 0xd8, 0x1, 0x34, 0xed, 0x9b, 0x42, 0xf1, 0x28, 0x5e, 0x87, 0xb2, 0x6b, 0x1d, 0xc4, 0x88, 0x51, 0x27, 0xfe, 0xcb, 0x12, 0x64, 0xbd, 0xe, 0xd7, 0xa1, 0x78, 0x4d, 0x94, 0xe2, 0x3b, 0x99, 0x40, 0x36, 0xef, 0xda, 0x3, 0x75, 0xac, 0x1f, 0xc6, 0xb0, 0x69, 0x5c, 0x85, 0xf3, 0x2a, 0xaa, 0x73, 0x5, 0xdc, 0xe9, 0x30, 0x46, 0x9f, 0x2c, 0xf5, 0x83, 0x5a, 0x6f, 0xb6, 0xc0, 0x19, 0xbb, 0x62, 0x14, 0xcd, 0xf8, 0x21, 0x57, 0x8e, 0x3d, 0xe4, 0x92, 0x4b, 0x7e, 0xa7, 0xd1, 0x8, 0xcc, 0x15, 0x63, 0xba, 0x8f, 0x56, 0x20, 0xf9, 0x4a, 0x93, 0xe5, 0x3c, 0x9, 0xd0, 0xa6, 0x7f, 0xdd, 0x4, 0x72, 0xab, 0x9e, 0x47, 0x31, 0xe8, 0x5b, 0x82, 0xf4, 0x2d, 0x18, 0xc1, 0xb7, 0x6e, 0xee, 0x37, 0x41, 0x98, 0xad, 0x74, 0x2, 0xdb, 0x68, 0xb1, 0xc7, 0x1e, 0x2b, 0xf2, 0x84, 0x5d, 0xff, 0x26, 0x50, 0x89, 0xbc, 0x65, 0x13, 0xca, 0x79, 0xa0, 0xd6, 0xf, 0x3a, 0xe3, 0x95, 0x4c},
+ {0x0, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0xb, 0x78, 0xa2, 0x21, 0xfb, 0x88, 0x52, 0x6e, 0xb4, 0xc7, 0x1d, 0xbf, 0x65, 0x16, 0xcc, 0xf0, 0x2a, 0x59, 0x83, 0x42, 0x98, 0xeb, 0x31, 0xd, 0xd7, 0xa4, 0x7e, 0xdc, 0x6, 0x75, 0xaf, 0x93, 0x49, 0x3a, 0xe0, 0x63, 0xb9, 0xca, 0x10, 0x2c, 0xf6, 0x85, 0x5f, 0xfd, 0x27, 0x54, 0x8e, 0xb2, 0x68, 0x1b, 0xc1, 0x84, 0x5e, 0x2d, 0xf7, 0xcb, 0x11, 0x62, 0xb8, 0x1a, 0xc0, 0xb3, 0x69, 0x55, 0x8f, 0xfc, 0x26, 0xa5, 0x7f, 0xc, 0xd6, 0xea, 0x30, 0x43, 0x99, 0x3b, 0xe1, 0x92, 0x48, 0x74, 0xae, 0xdd, 0x7, 0xc6, 0x1c, 0x6f, 0xb5, 0x89, 0x53, 0x20, 0xfa, 0x58, 0x82, 0xf1, 0x2b, 0x17, 0xcd, 0xbe, 0x64, 0xe7, 0x3d, 0x4e, 0x94, 0xa8, 0x72, 0x1, 0xdb, 0x79, 0xa3, 0xd0, 0xa, 0x36, 0xec, 0x9f, 0x45, 0x15, 0xcf, 0xbc, 0x66, 0x5a, 0x80, 0xf3, 0x29, 0x8b, 0x51, 0x22, 0xf8, 0xc4, 0x1e, 0x6d, 0xb7, 0x34, 0xee, 0x9d, 0x47, 0x7b, 0xa1, 0xd2, 0x8, 0xaa, 0x70, 0x3, 0xd9, 0xe5, 0x3f, 0x4c, 0x96, 0x57, 0x8d, 0xfe, 0x24, 0x18, 0xc2, 0xb1, 0x6b, 0xc9, 0x13, 0x60, 0xba, 0x86, 0x5c, 0x2f, 0xf5, 0x76, 0xac, 0xdf, 0x5, 0x39, 0xe3, 0x90, 0x4a, 0xe8, 0x32, 0x41, 0x9b, 0xa7, 0x7d, 0xe, 0xd4, 0x91, 0x4b, 0x38, 0xe2, 0xde, 0x4, 0x77, 0xad, 0xf, 0xd5, 0xa6, 0x7c, 0x40, 0x9a, 0xe9, 0x33, 0xb0, 0x6a, 0x19, 0xc3, 0xff, 0x25, 0x56, 0x8c, 0x2e, 0xf4, 0x87, 0x5d, 0x61, 0xbb, 0xc8, 0x12, 0xd3, 0x9, 0x7a, 0xa0, 0x9c, 0x46, 0x35, 0xef, 0x4d, 0x97, 0xe4, 0x3e, 0x2, 0xd8, 0xab, 0x71, 0xf2, 0x28, 0x5b, 0x81, 0xbd, 0x67, 0x14, 0xce, 0x6c, 0xb6, 0xc5, 0x1f, 0x23, 0xf9, 0x8a, 0x50},
+ {0x0, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x6, 0x76, 0xad, 0x31, 0xea, 0x9a, 0x41, 0x7a, 0xa1, 0xd1, 0xa, 0xa7, 0x7c, 0xc, 0xd7, 0xec, 0x37, 0x47, 0x9c, 0x62, 0xb9, 0xc9, 0x12, 0x29, 0xf2, 0x82, 0x59, 0xf4, 0x2f, 0x5f, 0x84, 0xbf, 0x64, 0x14, 0xcf, 0x53, 0x88, 0xf8, 0x23, 0x18, 0xc3, 0xb3, 0x68, 0xc5, 0x1e, 0x6e, 0xb5, 0x8e, 0x55, 0x25, 0xfe, 0xc4, 0x1f, 0x6f, 0xb4, 0x8f, 0x54, 0x24, 0xff, 0x52, 0x89, 0xf9, 0x22, 0x19, 0xc2, 0xb2, 0x69, 0xf5, 0x2e, 0x5e, 0x85, 0xbe, 0x65, 0x15, 0xce, 0x63, 0xb8, 0xc8, 0x13, 0x28, 0xf3, 0x83, 0x58, 0xa6, 0x7d, 0xd, 0xd6, 0xed, 0x36, 0x46, 0x9d, 0x30, 0xeb, 0x9b, 0x40, 0x7b, 0xa0, 0xd0, 0xb, 0x97, 0x4c, 0x3c, 0xe7, 0xdc, 0x7, 0x77, 0xac, 0x1, 0xda, 0xaa, 0x71, 0x4a, 0x91, 0xe1, 0x3a, 0x95, 0x4e, 0x3e, 0xe5, 0xde, 0x5, 0x75, 0xae, 0x3, 0xd8, 0xa8, 0x73, 0x48, 0x93, 0xe3, 0x38, 0xa4, 0x7f, 0xf, 0xd4, 0xef, 0x34, 0x44, 0x9f, 0x32, 0xe9, 0x99, 0x42, 0x79, 0xa2, 0xd2, 0x9, 0xf7, 0x2c, 0x5c, 0x87, 0xbc, 0x67, 0x17, 0xcc, 0x61, 0xba, 0xca, 0x11, 0x2a, 0xf1, 0x81, 0x5a, 0xc6, 0x1d, 0x6d, 0xb6, 0x8d, 0x56, 0x26, 0xfd, 0x50, 0x8b, 0xfb, 0x20, 0x1b, 0xc0, 0xb0, 0x6b, 0x51, 0x8a, 0xfa, 0x21, 0x1a, 0xc1, 0xb1, 0x6a, 0xc7, 0x1c, 0x6c, 0xb7, 0x8c, 0x57, 0x27, 0xfc, 0x60, 0xbb, 0xcb, 0x10, 0x2b, 0xf0, 0x80, 0x5b, 0xf6, 0x2d, 0x5d, 0x86, 0xbd, 0x66, 0x16, 0xcd, 0x33, 0xe8, 0x98, 0x43, 0x78, 0xa3, 0xd3, 0x8, 0xa5, 0x7e, 0xe, 0xd5, 0xee, 0x35, 0x45, 0x9e, 0x2, 0xd9, 0xa9, 0x72, 0x49, 0x92, 0xe2, 0x39, 0x94, 0x4f, 0x3f, 0xe4, 0xdf, 0x4, 0x74, 0xaf},
+ {0x0, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0xb, 0xd7, 0xf9, 0x25, 0x5c, 0x80, 0x41, 0x9d, 0xe4, 0x38, 0x16, 0xca, 0xb3, 0x6f, 0xef, 0x33, 0x4a, 0x96, 0xb8, 0x64, 0x1d, 0xc1, 0x82, 0x5e, 0x27, 0xfb, 0xd5, 0x9, 0x70, 0xac, 0x2c, 0xf0, 0x89, 0x55, 0x7b, 0xa7, 0xde, 0x2, 0xc3, 0x1f, 0x66, 0xba, 0x94, 0x48, 0x31, 0xed, 0x6d, 0xb1, 0xc8, 0x14, 0x3a, 0xe6, 0x9f, 0x43, 0x19, 0xc5, 0xbc, 0x60, 0x4e, 0x92, 0xeb, 0x37, 0xb7, 0x6b, 0x12, 0xce, 0xe0, 0x3c, 0x45, 0x99, 0x58, 0x84, 0xfd, 0x21, 0xf, 0xd3, 0xaa, 0x76, 0xf6, 0x2a, 0x53, 0x8f, 0xa1, 0x7d, 0x4, 0xd8, 0x9b, 0x47, 0x3e, 0xe2, 0xcc, 0x10, 0x69, 0xb5, 0x35, 0xe9, 0x90, 0x4c, 0x62, 0xbe, 0xc7, 0x1b, 0xda, 0x6, 0x7f, 0xa3, 0x8d, 0x51, 0x28, 0xf4, 0x74, 0xa8, 0xd1, 0xd, 0x23, 0xff, 0x86, 0x5a, 0x32, 0xee, 0x97, 0x4b, 0x65, 0xb9, 0xc0, 0x1c, 0x9c, 0x40, 0x39, 0xe5, 0xcb, 0x17, 0x6e, 0xb2, 0x73, 0xaf, 0xd6, 0xa, 0x24, 0xf8, 0x81, 0x5d, 0xdd, 0x1, 0x78, 0xa4, 0x8a, 0x56, 0x2f, 0xf3, 0xb0, 0x6c, 0x15, 0xc9, 0xe7, 0x3b, 0x42, 0x9e, 0x1e, 0xc2, 0xbb, 0x67, 0x49, 0x95, 0xec, 0x30, 0xf1, 0x2d, 0x54, 0x88, 0xa6, 0x7a, 0x3, 0xdf, 0x5f, 0x83, 0xfa, 0x26, 0x8, 0xd4, 0xad, 0x71, 0x2b, 0xf7, 0x8e, 0x52, 0x7c, 0xa0, 0xd9, 0x5, 0x85, 0x59, 0x20, 0xfc, 0xd2, 0xe, 0x77, 0xab, 0x6a, 0xb6, 0xcf, 0x13, 0x3d, 0xe1, 0x98, 0x44, 0xc4, 0x18, 0x61, 0xbd, 0x93, 0x4f, 0x36, 0xea, 0xa9, 0x75, 0xc, 0xd0, 0xfe, 0x22, 0x5b, 0x87, 0x7, 0xdb, 0xa2, 0x7e, 0x50, 0x8c, 0xf5, 0x29, 0xe8, 0x34, 0x4d, 0x91, 0xbf, 0x63, 0x1a, 0xc6, 0x46, 0x9a, 0xe3, 0x3f, 0x11, 0xcd, 0xb4, 0x68},
+ {0x0, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x1, 0xdc, 0xf5, 0x28, 0x52, 0x8f, 0x51, 0x8c, 0xf6, 0x2b, 0x2, 0xdf, 0xa5, 0x78, 0xf7, 0x2a, 0x50, 0x8d, 0xa4, 0x79, 0x3, 0xde, 0xa2, 0x7f, 0x5, 0xd8, 0xf1, 0x2c, 0x56, 0x8b, 0x4, 0xd9, 0xa3, 0x7e, 0x57, 0x8a, 0xf0, 0x2d, 0xf3, 0x2e, 0x54, 0x89, 0xa0, 0x7d, 0x7, 0xda, 0x55, 0x88, 0xf2, 0x2f, 0x6, 0xdb, 0xa1, 0x7c, 0x59, 0x84, 0xfe, 0x23, 0xa, 0xd7, 0xad, 0x70, 0xff, 0x22, 0x58, 0x85, 0xac, 0x71, 0xb, 0xd6, 0x8, 0xd5, 0xaf, 0x72, 0x5b, 0x86, 0xfc, 0x21, 0xae, 0x73, 0x9, 0xd4, 0xfd, 0x20, 0x5a, 0x87, 0xfb, 0x26, 0x5c, 0x81, 0xa8, 0x75, 0xf, 0xd2, 0x5d, 0x80, 0xfa, 0x27, 0xe, 0xd3, 0xa9, 0x74, 0xaa, 0x77, 0xd, 0xd0, 0xf9, 0x24, 0x5e, 0x83, 0xc, 0xd1, 0xab, 0x76, 0x5f, 0x82, 0xf8, 0x25, 0xb2, 0x6f, 0x15, 0xc8, 0xe1, 0x3c, 0x46, 0x9b, 0x14, 0xc9, 0xb3, 0x6e, 0x47, 0x9a, 0xe0, 0x3d, 0xe3, 0x3e, 0x44, 0x99, 0xb0, 0x6d, 0x17, 0xca, 0x45, 0x98, 0xe2, 0x3f, 0x16, 0xcb, 0xb1, 0x6c, 0x10, 0xcd, 0xb7, 0x6a, 0x43, 0x9e, 0xe4, 0x39, 0xb6, 0x6b, 0x11, 0xcc, 0xe5, 0x38, 0x42, 0x9f, 0x41, 0x9c, 0xe6, 0x3b, 0x12, 0xcf, 0xb5, 0x68, 0xe7, 0x3a, 0x40, 0x9d, 0xb4, 0x69, 0x13, 0xce, 0xeb, 0x36, 0x4c, 0x91, 0xb8, 0x65, 0x1f, 0xc2, 0x4d, 0x90, 0xea, 0x37, 0x1e, 0xc3, 0xb9, 0x64, 0xba, 0x67, 0x1d, 0xc0, 0xe9, 0x34, 0x4e, 0x93, 0x1c, 0xc1, 0xbb, 0x66, 0x4f, 0x92, 0xe8, 0x35, 0x49, 0x94, 0xee, 0x33, 0x1a, 0xc7, 0xbd, 0x60, 0xef, 0x32, 0x48, 0x95, 0xbc, 0x61, 0x1b, 0xc6, 0x18, 0xc5, 0xbf, 0x62, 0x4b, 0x96, 0xec, 0x31, 0xbe, 0x63, 0x19, 0xc4, 0xed, 0x30, 0x4a, 0x97},
+ {0x0, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e, 0x61, 0xbf, 0xc0, 0x1e, 0x3e, 0xe0, 0x9f, 0x41, 0xdf, 0x1, 0x7e, 0xa0, 0x80, 0x5e, 0x21, 0xff, 0xc2, 0x1c, 0x63, 0xbd, 0x9d, 0x43, 0x3c, 0xe2, 0x7c, 0xa2, 0xdd, 0x3, 0x23, 0xfd, 0x82, 0x5c, 0xa3, 0x7d, 0x2, 0xdc, 0xfc, 0x22, 0x5d, 0x83, 0x1d, 0xc3, 0xbc, 0x62, 0x42, 0x9c, 0xe3, 0x3d, 0x99, 0x47, 0x38, 0xe6, 0xc6, 0x18, 0x67, 0xb9, 0x27, 0xf9, 0x86, 0x58, 0x78, 0xa6, 0xd9, 0x7, 0xf8, 0x26, 0x59, 0x87, 0xa7, 0x79, 0x6, 0xd8, 0x46, 0x98, 0xe7, 0x39, 0x19, 0xc7, 0xb8, 0x66, 0x5b, 0x85, 0xfa, 0x24, 0x4, 0xda, 0xa5, 0x7b, 0xe5, 0x3b, 0x44, 0x9a, 0xba, 0x64, 0x1b, 0xc5, 0x3a, 0xe4, 0x9b, 0x45, 0x65, 0xbb, 0xc4, 0x1a, 0x84, 0x5a, 0x25, 0xfb, 0xdb, 0x5, 0x7a, 0xa4, 0x2f, 0xf1, 0x8e, 0x50, 0x70, 0xae, 0xd1, 0xf, 0x91, 0x4f, 0x30, 0xee, 0xce, 0x10, 0x6f, 0xb1, 0x4e, 0x90, 0xef, 0x31, 0x11, 0xcf, 0xb0, 0x6e, 0xf0, 0x2e, 0x51, 0x8f, 0xaf, 0x71, 0xe, 0xd0, 0xed, 0x33, 0x4c, 0x92, 0xb2, 0x6c, 0x13, 0xcd, 0x53, 0x8d, 0xf2, 0x2c, 0xc, 0xd2, 0xad, 0x73, 0x8c, 0x52, 0x2d, 0xf3, 0xd3, 0xd, 0x72, 0xac, 0x32, 0xec, 0x93, 0x4d, 0x6d, 0xb3, 0xcc, 0x12, 0xb6, 0x68, 0x17, 0xc9, 0xe9, 0x37, 0x48, 0x96, 0x8, 0xd6, 0xa9, 0x77, 0x57, 0x89, 0xf6, 0x28, 0xd7, 0x9, 0x76, 0xa8, 0x88, 0x56, 0x29, 0xf7, 0x69, 0xb7, 0xc8, 0x16, 0x36, 0xe8, 0x97, 0x49, 0x74, 0xaa, 0xd5, 0xb, 0x2b, 0xf5, 0x8a, 0x54, 0xca, 0x14, 0x6b, 0xb5, 0x95, 0x4b, 0x34, 0xea, 0x15, 0xcb, 0xb4, 0x6a, 0x4a, 0x94, 0xeb, 0x35, 0xab, 0x75, 0xa, 0xd4, 0xf4, 0x2a, 0x55, 0x8b},
+ {0x0, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91, 0x71, 0xae, 0xd2, 0xd, 0x2a, 0xf5, 0x89, 0x56, 0xc7, 0x18, 0x64, 0xbb, 0x9c, 0x43, 0x3f, 0xe0, 0xe2, 0x3d, 0x41, 0x9e, 0xb9, 0x66, 0x1a, 0xc5, 0x54, 0x8b, 0xf7, 0x28, 0xf, 0xd0, 0xac, 0x73, 0x93, 0x4c, 0x30, 0xef, 0xc8, 0x17, 0x6b, 0xb4, 0x25, 0xfa, 0x86, 0x59, 0x7e, 0xa1, 0xdd, 0x2, 0xd9, 0x6, 0x7a, 0xa5, 0x82, 0x5d, 0x21, 0xfe, 0x6f, 0xb0, 0xcc, 0x13, 0x34, 0xeb, 0x97, 0x48, 0xa8, 0x77, 0xb, 0xd4, 0xf3, 0x2c, 0x50, 0x8f, 0x1e, 0xc1, 0xbd, 0x62, 0x45, 0x9a, 0xe6, 0x39, 0x3b, 0xe4, 0x98, 0x47, 0x60, 0xbf, 0xc3, 0x1c, 0x8d, 0x52, 0x2e, 0xf1, 0xd6, 0x9, 0x75, 0xaa, 0x4a, 0x95, 0xe9, 0x36, 0x11, 0xce, 0xb2, 0x6d, 0xfc, 0x23, 0x5f, 0x80, 0xa7, 0x78, 0x4, 0xdb, 0xaf, 0x70, 0xc, 0xd3, 0xf4, 0x2b, 0x57, 0x88, 0x19, 0xc6, 0xba, 0x65, 0x42, 0x9d, 0xe1, 0x3e, 0xde, 0x1, 0x7d, 0xa2, 0x85, 0x5a, 0x26, 0xf9, 0x68, 0xb7, 0xcb, 0x14, 0x33, 0xec, 0x90, 0x4f, 0x4d, 0x92, 0xee, 0x31, 0x16, 0xc9, 0xb5, 0x6a, 0xfb, 0x24, 0x58, 0x87, 0xa0, 0x7f, 0x3, 0xdc, 0x3c, 0xe3, 0x9f, 0x40, 0x67, 0xb8, 0xc4, 0x1b, 0x8a, 0x55, 0x29, 0xf6, 0xd1, 0xe, 0x72, 0xad, 0x76, 0xa9, 0xd5, 0xa, 0x2d, 0xf2, 0x8e, 0x51, 0xc0, 0x1f, 0x63, 0xbc, 0x9b, 0x44, 0x38, 0xe7, 0x7, 0xd8, 0xa4, 0x7b, 0x5c, 0x83, 0xff, 0x20, 0xb1, 0x6e, 0x12, 0xcd, 0xea, 0x35, 0x49, 0x96, 0x94, 0x4b, 0x37, 0xe8, 0xcf, 0x10, 0x6c, 0xb3, 0x22, 0xfd, 0x81, 0x5e, 0x79, 0xa6, 0xda, 0x5, 0xe5, 0x3a, 0x46, 0x99, 0xbe, 0x61, 0x1d, 0xc2, 0x53, 0x8c, 0xf0, 0x2f, 0x8, 0xd7, 0xab, 0x74},
+ {0x0, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9, 0xa6, 0x46, 0x7b, 0x9b, 0x1, 0xe1, 0xdc, 0x3c, 0xf5, 0x15, 0x28, 0xc8, 0x52, 0xb2, 0x8f, 0x6f, 0x51, 0xb1, 0x8c, 0x6c, 0xf6, 0x16, 0x2b, 0xcb, 0x2, 0xe2, 0xdf, 0x3f, 0xa5, 0x45, 0x78, 0x98, 0xf7, 0x17, 0x2a, 0xca, 0x50, 0xb0, 0x8d, 0x6d, 0xa4, 0x44, 0x79, 0x99, 0x3, 0xe3, 0xde, 0x3e, 0xa2, 0x42, 0x7f, 0x9f, 0x5, 0xe5, 0xd8, 0x38, 0xf1, 0x11, 0x2c, 0xcc, 0x56, 0xb6, 0x8b, 0x6b, 0x4, 0xe4, 0xd9, 0x39, 0xa3, 0x43, 0x7e, 0x9e, 0x57, 0xb7, 0x8a, 0x6a, 0xf0, 0x10, 0x2d, 0xcd, 0xf3, 0x13, 0x2e, 0xce, 0x54, 0xb4, 0x89, 0x69, 0xa0, 0x40, 0x7d, 0x9d, 0x7, 0xe7, 0xda, 0x3a, 0x55, 0xb5, 0x88, 0x68, 0xf2, 0x12, 0x2f, 0xcf, 0x6, 0xe6, 0xdb, 0x3b, 0xa1, 0x41, 0x7c, 0x9c, 0x59, 0xb9, 0x84, 0x64, 0xfe, 0x1e, 0x23, 0xc3, 0xa, 0xea, 0xd7, 0x37, 0xad, 0x4d, 0x70, 0x90, 0xff, 0x1f, 0x22, 0xc2, 0x58, 0xb8, 0x85, 0x65, 0xac, 0x4c, 0x71, 0x91, 0xb, 0xeb, 0xd6, 0x36, 0x8, 0xe8, 0xd5, 0x35, 0xaf, 0x4f, 0x72, 0x92, 0x5b, 0xbb, 0x86, 0x66, 0xfc, 0x1c, 0x21, 0xc1, 0xae, 0x4e, 0x73, 0x93, 0x9, 0xe9, 0xd4, 0x34, 0xfd, 0x1d, 0x20, 0xc0, 0x5a, 0xba, 0x87, 0x67, 0xfb, 0x1b, 0x26, 0xc6, 0x5c, 0xbc, 0x81, 0x61, 0xa8, 0x48, 0x75, 0x95, 0xf, 0xef, 0xd2, 0x32, 0x5d, 0xbd, 0x80, 0x60, 0xfa, 0x1a, 0x27, 0xc7, 0xe, 0xee, 0xd3, 0x33, 0xa9, 0x49, 0x74, 0x94, 0xaa, 0x4a, 0x77, 0x97, 0xd, 0xed, 0xd0, 0x30, 0xf9, 0x19, 0x24, 0xc4, 0x5e, 0xbe, 0x83, 0x63, 0xc, 0xec, 0xd1, 0x31, 0xab, 0x4b, 0x76, 0x96, 0x5f, 0xbf, 0x82, 0x62, 0xf8, 0x18, 0x25, 0xc5},
+ {0x0, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6, 0xb6, 0x57, 0x69, 0x88, 0x15, 0xf4, 0xca, 0x2b, 0xed, 0xc, 0x32, 0xd3, 0x4e, 0xaf, 0x91, 0x70, 0x71, 0x90, 0xae, 0x4f, 0xd2, 0x33, 0xd, 0xec, 0x2a, 0xcb, 0xf5, 0x14, 0x89, 0x68, 0x56, 0xb7, 0xc7, 0x26, 0x18, 0xf9, 0x64, 0x85, 0xbb, 0x5a, 0x9c, 0x7d, 0x43, 0xa2, 0x3f, 0xde, 0xe0, 0x1, 0xe2, 0x3, 0x3d, 0xdc, 0x41, 0xa0, 0x9e, 0x7f, 0xb9, 0x58, 0x66, 0x87, 0x1a, 0xfb, 0xc5, 0x24, 0x54, 0xb5, 0x8b, 0x6a, 0xf7, 0x16, 0x28, 0xc9, 0xf, 0xee, 0xd0, 0x31, 0xac, 0x4d, 0x73, 0x92, 0x93, 0x72, 0x4c, 0xad, 0x30, 0xd1, 0xef, 0xe, 0xc8, 0x29, 0x17, 0xf6, 0x6b, 0x8a, 0xb4, 0x55, 0x25, 0xc4, 0xfa, 0x1b, 0x86, 0x67, 0x59, 0xb8, 0x7e, 0x9f, 0xa1, 0x40, 0xdd, 0x3c, 0x2, 0xe3, 0xd9, 0x38, 0x6, 0xe7, 0x7a, 0x9b, 0xa5, 0x44, 0x82, 0x63, 0x5d, 0xbc, 0x21, 0xc0, 0xfe, 0x1f, 0x6f, 0x8e, 0xb0, 0x51, 0xcc, 0x2d, 0x13, 0xf2, 0x34, 0xd5, 0xeb, 0xa, 0x97, 0x76, 0x48, 0xa9, 0xa8, 0x49, 0x77, 0x96, 0xb, 0xea, 0xd4, 0x35, 0xf3, 0x12, 0x2c, 0xcd, 0x50, 0xb1, 0x8f, 0x6e, 0x1e, 0xff, 0xc1, 0x20, 0xbd, 0x5c, 0x62, 0x83, 0x45, 0xa4, 0x9a, 0x7b, 0xe6, 0x7, 0x39, 0xd8, 0x3b, 0xda, 0xe4, 0x5, 0x98, 0x79, 0x47, 0xa6, 0x60, 0x81, 0xbf, 0x5e, 0xc3, 0x22, 0x1c, 0xfd, 0x8d, 0x6c, 0x52, 0xb3, 0x2e, 0xcf, 0xf1, 0x10, 0xd6, 0x37, 0x9, 0xe8, 0x75, 0x94, 0xaa, 0x4b, 0x4a, 0xab, 0x95, 0x74, 0xe9, 0x8, 0x36, 0xd7, 0x11, 0xf0, 0xce, 0x2f, 0xb2, 0x53, 0x6d, 0x8c, 0xfc, 0x1d, 0x23, 0xc2, 0x5f, 0xbe, 0x80, 0x61, 0xa7, 0x46, 0x78, 0x99, 0x4, 0xe5, 0xdb, 0x3a},
+ {0x0, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0xe, 0x35, 0xd7, 0x86, 0x64, 0x5f, 0xbd, 0x29, 0xcb, 0xf0, 0x12, 0xc5, 0x27, 0x1c, 0xfe, 0x6a, 0x88, 0xb3, 0x51, 0x11, 0xf3, 0xc8, 0x2a, 0xbe, 0x5c, 0x67, 0x85, 0x52, 0xb0, 0x8b, 0x69, 0xfd, 0x1f, 0x24, 0xc6, 0x97, 0x75, 0x4e, 0xac, 0x38, 0xda, 0xe1, 0x3, 0xd4, 0x36, 0xd, 0xef, 0x7b, 0x99, 0xa2, 0x40, 0x22, 0xc0, 0xfb, 0x19, 0x8d, 0x6f, 0x54, 0xb6, 0x61, 0x83, 0xb8, 0x5a, 0xce, 0x2c, 0x17, 0xf5, 0xa4, 0x46, 0x7d, 0x9f, 0xb, 0xe9, 0xd2, 0x30, 0xe7, 0x5, 0x3e, 0xdc, 0x48, 0xaa, 0x91, 0x73, 0x33, 0xd1, 0xea, 0x8, 0x9c, 0x7e, 0x45, 0xa7, 0x70, 0x92, 0xa9, 0x4b, 0xdf, 0x3d, 0x6, 0xe4, 0xb5, 0x57, 0x6c, 0x8e, 0x1a, 0xf8, 0xc3, 0x21, 0xf6, 0x14, 0x2f, 0xcd, 0x59, 0xbb, 0x80, 0x62, 0x44, 0xa6, 0x9d, 0x7f, 0xeb, 0x9, 0x32, 0xd0, 0x7, 0xe5, 0xde, 0x3c, 0xa8, 0x4a, 0x71, 0x93, 0xc2, 0x20, 0x1b, 0xf9, 0x6d, 0x8f, 0xb4, 0x56, 0x81, 0x63, 0x58, 0xba, 0x2e, 0xcc, 0xf7, 0x15, 0x55, 0xb7, 0x8c, 0x6e, 0xfa, 0x18, 0x23, 0xc1, 0x16, 0xf4, 0xcf, 0x2d, 0xb9, 0x5b, 0x60, 0x82, 0xd3, 0x31, 0xa, 0xe8, 0x7c, 0x9e, 0xa5, 0x47, 0x90, 0x72, 0x49, 0xab, 0x3f, 0xdd, 0xe6, 0x4, 0x66, 0x84, 0xbf, 0x5d, 0xc9, 0x2b, 0x10, 0xf2, 0x25, 0xc7, 0xfc, 0x1e, 0x8a, 0x68, 0x53, 0xb1, 0xe0, 0x2, 0x39, 0xdb, 0x4f, 0xad, 0x96, 0x74, 0xa3, 0x41, 0x7a, 0x98, 0xc, 0xee, 0xd5, 0x37, 0x77, 0x95, 0xae, 0x4c, 0xd8, 0x3a, 0x1, 0xe3, 0x34, 0xd6, 0xed, 0xf, 0x9b, 0x79, 0x42, 0xa0, 0xf1, 0x13, 0x28, 0xca, 0x5e, 0xbc, 0x87, 0x65, 0xb2, 0x50, 0x6b, 0x89, 0x1d, 0xff, 0xc4, 0x26},
+ {0x0, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x3, 0x3b, 0xd8, 0x96, 0x75, 0x4d, 0xae, 0x3d, 0xde, 0xe6, 0x5, 0xdd, 0x3e, 0x6, 0xe5, 0x76, 0x95, 0xad, 0x4e, 0x31, 0xd2, 0xea, 0x9, 0x9a, 0x79, 0x41, 0xa2, 0x7a, 0x99, 0xa1, 0x42, 0xd1, 0x32, 0xa, 0xe9, 0xa7, 0x44, 0x7c, 0x9f, 0xc, 0xef, 0xd7, 0x34, 0xec, 0xf, 0x37, 0xd4, 0x47, 0xa4, 0x9c, 0x7f, 0x62, 0x81, 0xb9, 0x5a, 0xc9, 0x2a, 0x12, 0xf1, 0x29, 0xca, 0xf2, 0x11, 0x82, 0x61, 0x59, 0xba, 0xf4, 0x17, 0x2f, 0xcc, 0x5f, 0xbc, 0x84, 0x67, 0xbf, 0x5c, 0x64, 0x87, 0x14, 0xf7, 0xcf, 0x2c, 0x53, 0xb0, 0x88, 0x6b, 0xf8, 0x1b, 0x23, 0xc0, 0x18, 0xfb, 0xc3, 0x20, 0xb3, 0x50, 0x68, 0x8b, 0xc5, 0x26, 0x1e, 0xfd, 0x6e, 0x8d, 0xb5, 0x56, 0x8e, 0x6d, 0x55, 0xb6, 0x25, 0xc6, 0xfe, 0x1d, 0xc4, 0x27, 0x1f, 0xfc, 0x6f, 0x8c, 0xb4, 0x57, 0x8f, 0x6c, 0x54, 0xb7, 0x24, 0xc7, 0xff, 0x1c, 0x52, 0xb1, 0x89, 0x6a, 0xf9, 0x1a, 0x22, 0xc1, 0x19, 0xfa, 0xc2, 0x21, 0xb2, 0x51, 0x69, 0x8a, 0xf5, 0x16, 0x2e, 0xcd, 0x5e, 0xbd, 0x85, 0x66, 0xbe, 0x5d, 0x65, 0x86, 0x15, 0xf6, 0xce, 0x2d, 0x63, 0x80, 0xb8, 0x5b, 0xc8, 0x2b, 0x13, 0xf0, 0x28, 0xcb, 0xf3, 0x10, 0x83, 0x60, 0x58, 0xbb, 0xa6, 0x45, 0x7d, 0x9e, 0xd, 0xee, 0xd6, 0x35, 0xed, 0xe, 0x36, 0xd5, 0x46, 0xa5, 0x9d, 0x7e, 0x30, 0xd3, 0xeb, 0x8, 0x9b, 0x78, 0x40, 0xa3, 0x7b, 0x98, 0xa0, 0x43, 0xd0, 0x33, 0xb, 0xe8, 0x97, 0x74, 0x4c, 0xaf, 0x3c, 0xdf, 0xe7, 0x4, 0xdc, 0x3f, 0x7, 0xe4, 0x77, 0x94, 0xac, 0x4f, 0x1, 0xe2, 0xda, 0x39, 0xaa, 0x49, 0x71, 0x92, 0x4a, 0xa9, 0x91, 0x72, 0xe1, 0x2, 0x3a, 0xd9},
+ {0x0, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5, 0xe6, 0x2, 0x33, 0xd7, 0x51, 0xb5, 0x84, 0x60, 0x95, 0x71, 0x40, 0xa4, 0x22, 0xc6, 0xf7, 0x13, 0xd1, 0x35, 0x4, 0xe0, 0x66, 0x82, 0xb3, 0x57, 0xa2, 0x46, 0x77, 0x93, 0x15, 0xf1, 0xc0, 0x24, 0x37, 0xd3, 0xe2, 0x6, 0x80, 0x64, 0x55, 0xb1, 0x44, 0xa0, 0x91, 0x75, 0xf3, 0x17, 0x26, 0xc2, 0xbf, 0x5b, 0x6a, 0x8e, 0x8, 0xec, 0xdd, 0x39, 0xcc, 0x28, 0x19, 0xfd, 0x7b, 0x9f, 0xae, 0x4a, 0x59, 0xbd, 0x8c, 0x68, 0xee, 0xa, 0x3b, 0xdf, 0x2a, 0xce, 0xff, 0x1b, 0x9d, 0x79, 0x48, 0xac, 0x6e, 0x8a, 0xbb, 0x5f, 0xd9, 0x3d, 0xc, 0xe8, 0x1d, 0xf9, 0xc8, 0x2c, 0xaa, 0x4e, 0x7f, 0x9b, 0x88, 0x6c, 0x5d, 0xb9, 0x3f, 0xdb, 0xea, 0xe, 0xfb, 0x1f, 0x2e, 0xca, 0x4c, 0xa8, 0x99, 0x7d, 0x63, 0x87, 0xb6, 0x52, 0xd4, 0x30, 0x1, 0xe5, 0x10, 0xf4, 0xc5, 0x21, 0xa7, 0x43, 0x72, 0x96, 0x85, 0x61, 0x50, 0xb4, 0x32, 0xd6, 0xe7, 0x3, 0xf6, 0x12, 0x23, 0xc7, 0x41, 0xa5, 0x94, 0x70, 0xb2, 0x56, 0x67, 0x83, 0x5, 0xe1, 0xd0, 0x34, 0xc1, 0x25, 0x14, 0xf0, 0x76, 0x92, 0xa3, 0x47, 0x54, 0xb0, 0x81, 0x65, 0xe3, 0x7, 0x36, 0xd2, 0x27, 0xc3, 0xf2, 0x16, 0x90, 0x74, 0x45, 0xa1, 0xdc, 0x38, 0x9, 0xed, 0x6b, 0x8f, 0xbe, 0x5a, 0xaf, 0x4b, 0x7a, 0x9e, 0x18, 0xfc, 0xcd, 0x29, 0x3a, 0xde, 0xef, 0xb, 0x8d, 0x69, 0x58, 0xbc, 0x49, 0xad, 0x9c, 0x78, 0xfe, 0x1a, 0x2b, 0xcf, 0xd, 0xe9, 0xd8, 0x3c, 0xba, 0x5e, 0x6f, 0x8b, 0x7e, 0x9a, 0xab, 0x4f, 0xc9, 0x2d, 0x1c, 0xf8, 0xeb, 0xf, 0x3e, 0xda, 0x5c, 0xb8, 0x89, 0x6d, 0x98, 0x7c, 0x4d, 0xa9, 0x2f, 0xcb, 0xfa, 0x1e},
+ {0x0, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa, 0xf6, 0x13, 0x21, 0xc4, 0x45, 0xa0, 0x92, 0x77, 0x8d, 0x68, 0x5a, 0xbf, 0x3e, 0xdb, 0xe9, 0xc, 0xf1, 0x14, 0x26, 0xc3, 0x42, 0xa7, 0x95, 0x70, 0x8a, 0x6f, 0x5d, 0xb8, 0x39, 0xdc, 0xee, 0xb, 0x7, 0xe2, 0xd0, 0x35, 0xb4, 0x51, 0x63, 0x86, 0x7c, 0x99, 0xab, 0x4e, 0xcf, 0x2a, 0x18, 0xfd, 0xff, 0x1a, 0x28, 0xcd, 0x4c, 0xa9, 0x9b, 0x7e, 0x84, 0x61, 0x53, 0xb6, 0x37, 0xd2, 0xe0, 0x5, 0x9, 0xec, 0xde, 0x3b, 0xba, 0x5f, 0x6d, 0x88, 0x72, 0x97, 0xa5, 0x40, 0xc1, 0x24, 0x16, 0xf3, 0xe, 0xeb, 0xd9, 0x3c, 0xbd, 0x58, 0x6a, 0x8f, 0x75, 0x90, 0xa2, 0x47, 0xc6, 0x23, 0x11, 0xf4, 0xf8, 0x1d, 0x2f, 0xca, 0x4b, 0xae, 0x9c, 0x79, 0x83, 0x66, 0x54, 0xb1, 0x30, 0xd5, 0xe7, 0x2, 0xe3, 0x6, 0x34, 0xd1, 0x50, 0xb5, 0x87, 0x62, 0x98, 0x7d, 0x4f, 0xaa, 0x2b, 0xce, 0xfc, 0x19, 0x15, 0xf0, 0xc2, 0x27, 0xa6, 0x43, 0x71, 0x94, 0x6e, 0x8b, 0xb9, 0x5c, 0xdd, 0x38, 0xa, 0xef, 0x12, 0xf7, 0xc5, 0x20, 0xa1, 0x44, 0x76, 0x93, 0x69, 0x8c, 0xbe, 0x5b, 0xda, 0x3f, 0xd, 0xe8, 0xe4, 0x1, 0x33, 0xd6, 0x57, 0xb2, 0x80, 0x65, 0x9f, 0x7a, 0x48, 0xad, 0x2c, 0xc9, 0xfb, 0x1e, 0x1c, 0xf9, 0xcb, 0x2e, 0xaf, 0x4a, 0x78, 0x9d, 0x67, 0x82, 0xb0, 0x55, 0xd4, 0x31, 0x3, 0xe6, 0xea, 0xf, 0x3d, 0xd8, 0x59, 0xbc, 0x8e, 0x6b, 0x91, 0x74, 0x46, 0xa3, 0x22, 0xc7, 0xf5, 0x10, 0xed, 0x8, 0x3a, 0xdf, 0x5e, 0xbb, 0x89, 0x6c, 0x96, 0x73, 0x41, 0xa4, 0x25, 0xc0, 0xf2, 0x17, 0x1b, 0xfe, 0xcc, 0x29, 0xa8, 0x4d, 0x7f, 0x9a, 0x60, 0x85, 0xb7, 0x52, 0xd3, 0x36, 0x4, 0xe1},
+ {0x0, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0xd, 0xeb, 0xc6, 0x20, 0x17, 0xf1, 0x79, 0x9f, 0xa8, 0x4e, 0xa5, 0x43, 0x74, 0x92, 0x1a, 0xfc, 0xcb, 0x2d, 0x91, 0x77, 0x40, 0xa6, 0x2e, 0xc8, 0xff, 0x19, 0xf2, 0x14, 0x23, 0xc5, 0x4d, 0xab, 0x9c, 0x7a, 0x57, 0xb1, 0x86, 0x60, 0xe8, 0xe, 0x39, 0xdf, 0x34, 0xd2, 0xe5, 0x3, 0x8b, 0x6d, 0x5a, 0xbc, 0x3f, 0xd9, 0xee, 0x8, 0x80, 0x66, 0x51, 0xb7, 0x5c, 0xba, 0x8d, 0x6b, 0xe3, 0x5, 0x32, 0xd4, 0xf9, 0x1f, 0x28, 0xce, 0x46, 0xa0, 0x97, 0x71, 0x9a, 0x7c, 0x4b, 0xad, 0x25, 0xc3, 0xf4, 0x12, 0xae, 0x48, 0x7f, 0x99, 0x11, 0xf7, 0xc0, 0x26, 0xcd, 0x2b, 0x1c, 0xfa, 0x72, 0x94, 0xa3, 0x45, 0x68, 0x8e, 0xb9, 0x5f, 0xd7, 0x31, 0x6, 0xe0, 0xb, 0xed, 0xda, 0x3c, 0xb4, 0x52, 0x65, 0x83, 0x7e, 0x98, 0xaf, 0x49, 0xc1, 0x27, 0x10, 0xf6, 0x1d, 0xfb, 0xcc, 0x2a, 0xa2, 0x44, 0x73, 0x95, 0xb8, 0x5e, 0x69, 0x8f, 0x7, 0xe1, 0xd6, 0x30, 0xdb, 0x3d, 0xa, 0xec, 0x64, 0x82, 0xb5, 0x53, 0xef, 0x9, 0x3e, 0xd8, 0x50, 0xb6, 0x81, 0x67, 0x8c, 0x6a, 0x5d, 0xbb, 0x33, 0xd5, 0xe2, 0x4, 0x29, 0xcf, 0xf8, 0x1e, 0x96, 0x70, 0x47, 0xa1, 0x4a, 0xac, 0x9b, 0x7d, 0xf5, 0x13, 0x24, 0xc2, 0x41, 0xa7, 0x90, 0x76, 0xfe, 0x18, 0x2f, 0xc9, 0x22, 0xc4, 0xf3, 0x15, 0x9d, 0x7b, 0x4c, 0xaa, 0x87, 0x61, 0x56, 0xb0, 0x38, 0xde, 0xe9, 0xf, 0xe4, 0x2, 0x35, 0xd3, 0x5b, 0xbd, 0x8a, 0x6c, 0xd0, 0x36, 0x1, 0xe7, 0x6f, 0x89, 0xbe, 0x58, 0xb3, 0x55, 0x62, 0x84, 0xc, 0xea, 0xdd, 0x3b, 0x16, 0xf0, 0xc7, 0x21, 0xa9, 0x4f, 0x78, 0x9e, 0x75, 0x93, 0xa4, 0x42, 0xca, 0x2c, 0x1b, 0xfd},
+ {0x0, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x3, 0xe4, 0xd6, 0x31, 0x5, 0xe2, 0x6d, 0x8a, 0xbe, 0x59, 0xbd, 0x5a, 0x6e, 0x89, 0x6, 0xe1, 0xd5, 0x32, 0xb1, 0x56, 0x62, 0x85, 0xa, 0xed, 0xd9, 0x3e, 0xda, 0x3d, 0x9, 0xee, 0x61, 0x86, 0xb2, 0x55, 0x67, 0x80, 0xb4, 0x53, 0xdc, 0x3b, 0xf, 0xe8, 0xc, 0xeb, 0xdf, 0x38, 0xb7, 0x50, 0x64, 0x83, 0x7f, 0x98, 0xac, 0x4b, 0xc4, 0x23, 0x17, 0xf0, 0x14, 0xf3, 0xc7, 0x20, 0xaf, 0x48, 0x7c, 0x9b, 0xa9, 0x4e, 0x7a, 0x9d, 0x12, 0xf5, 0xc1, 0x26, 0xc2, 0x25, 0x11, 0xf6, 0x79, 0x9e, 0xaa, 0x4d, 0xce, 0x29, 0x1d, 0xfa, 0x75, 0x92, 0xa6, 0x41, 0xa5, 0x42, 0x76, 0x91, 0x1e, 0xf9, 0xcd, 0x2a, 0x18, 0xff, 0xcb, 0x2c, 0xa3, 0x44, 0x70, 0x97, 0x73, 0x94, 0xa0, 0x47, 0xc8, 0x2f, 0x1b, 0xfc, 0xfe, 0x19, 0x2d, 0xca, 0x45, 0xa2, 0x96, 0x71, 0x95, 0x72, 0x46, 0xa1, 0x2e, 0xc9, 0xfd, 0x1a, 0x28, 0xcf, 0xfb, 0x1c, 0x93, 0x74, 0x40, 0xa7, 0x43, 0xa4, 0x90, 0x77, 0xf8, 0x1f, 0x2b, 0xcc, 0x4f, 0xa8, 0x9c, 0x7b, 0xf4, 0x13, 0x27, 0xc0, 0x24, 0xc3, 0xf7, 0x10, 0x9f, 0x78, 0x4c, 0xab, 0x99, 0x7e, 0x4a, 0xad, 0x22, 0xc5, 0xf1, 0x16, 0xf2, 0x15, 0x21, 0xc6, 0x49, 0xae, 0x9a, 0x7d, 0x81, 0x66, 0x52, 0xb5, 0x3a, 0xdd, 0xe9, 0xe, 0xea, 0xd, 0x39, 0xde, 0x51, 0xb6, 0x82, 0x65, 0x57, 0xb0, 0x84, 0x63, 0xec, 0xb, 0x3f, 0xd8, 0x3c, 0xdb, 0xef, 0x8, 0x87, 0x60, 0x54, 0xb3, 0x30, 0xd7, 0xe3, 0x4, 0x8b, 0x6c, 0x58, 0xbf, 0x5b, 0xbc, 0x88, 0x6f, 0xe0, 0x7, 0x33, 0xd4, 0xe6, 0x1, 0x35, 0xd2, 0x5d, 0xba, 0x8e, 0x69, 0x8d, 0x6a, 0x5e, 0xb9, 0x36, 0xd1, 0xe5, 0x2},
+ {0x0, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1, 0x26, 0xce, 0xeb, 0x3, 0xa1, 0x49, 0x6c, 0x84, 0x35, 0xdd, 0xf8, 0x10, 0xb2, 0x5a, 0x7f, 0x97, 0x4c, 0xa4, 0x81, 0x69, 0xcb, 0x23, 0x6, 0xee, 0x5f, 0xb7, 0x92, 0x7a, 0xd8, 0x30, 0x15, 0xfd, 0x6a, 0x82, 0xa7, 0x4f, 0xed, 0x5, 0x20, 0xc8, 0x79, 0x91, 0xb4, 0x5c, 0xfe, 0x16, 0x33, 0xdb, 0x98, 0x70, 0x55, 0xbd, 0x1f, 0xf7, 0xd2, 0x3a, 0x8b, 0x63, 0x46, 0xae, 0xc, 0xe4, 0xc1, 0x29, 0xbe, 0x56, 0x73, 0x9b, 0x39, 0xd1, 0xf4, 0x1c, 0xad, 0x45, 0x60, 0x88, 0x2a, 0xc2, 0xe7, 0xf, 0xd4, 0x3c, 0x19, 0xf1, 0x53, 0xbb, 0x9e, 0x76, 0xc7, 0x2f, 0xa, 0xe2, 0x40, 0xa8, 0x8d, 0x65, 0xf2, 0x1a, 0x3f, 0xd7, 0x75, 0x9d, 0xb8, 0x50, 0xe1, 0x9, 0x2c, 0xc4, 0x66, 0x8e, 0xab, 0x43, 0x2d, 0xc5, 0xe0, 0x8, 0xaa, 0x42, 0x67, 0x8f, 0x3e, 0xd6, 0xf3, 0x1b, 0xb9, 0x51, 0x74, 0x9c, 0xb, 0xe3, 0xc6, 0x2e, 0x8c, 0x64, 0x41, 0xa9, 0x18, 0xf0, 0xd5, 0x3d, 0x9f, 0x77, 0x52, 0xba, 0x61, 0x89, 0xac, 0x44, 0xe6, 0xe, 0x2b, 0xc3, 0x72, 0x9a, 0xbf, 0x57, 0xf5, 0x1d, 0x38, 0xd0, 0x47, 0xaf, 0x8a, 0x62, 0xc0, 0x28, 0xd, 0xe5, 0x54, 0xbc, 0x99, 0x71, 0xd3, 0x3b, 0x1e, 0xf6, 0xb5, 0x5d, 0x78, 0x90, 0x32, 0xda, 0xff, 0x17, 0xa6, 0x4e, 0x6b, 0x83, 0x21, 0xc9, 0xec, 0x4, 0x93, 0x7b, 0x5e, 0xb6, 0x14, 0xfc, 0xd9, 0x31, 0x80, 0x68, 0x4d, 0xa5, 0x7, 0xef, 0xca, 0x22, 0xf9, 0x11, 0x34, 0xdc, 0x7e, 0x96, 0xb3, 0x5b, 0xea, 0x2, 0x27, 0xcf, 0x6d, 0x85, 0xa0, 0x48, 0xdf, 0x37, 0x12, 0xfa, 0x58, 0xb0, 0x95, 0x7d, 0xcc, 0x24, 0x1, 0xe9, 0x4b, 0xa3, 0x86, 0x6e},
+ {0x0, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe, 0x36, 0xdf, 0xf9, 0x10, 0xb5, 0x5c, 0x7a, 0x93, 0x2d, 0xc4, 0xe2, 0xb, 0xae, 0x47, 0x61, 0x88, 0x6c, 0x85, 0xa3, 0x4a, 0xef, 0x6, 0x20, 0xc9, 0x77, 0x9e, 0xb8, 0x51, 0xf4, 0x1d, 0x3b, 0xd2, 0x5a, 0xb3, 0x95, 0x7c, 0xd9, 0x30, 0x16, 0xff, 0x41, 0xa8, 0x8e, 0x67, 0xc2, 0x2b, 0xd, 0xe4, 0xd8, 0x31, 0x17, 0xfe, 0x5b, 0xb2, 0x94, 0x7d, 0xc3, 0x2a, 0xc, 0xe5, 0x40, 0xa9, 0x8f, 0x66, 0xee, 0x7, 0x21, 0xc8, 0x6d, 0x84, 0xa2, 0x4b, 0xf5, 0x1c, 0x3a, 0xd3, 0x76, 0x9f, 0xb9, 0x50, 0xb4, 0x5d, 0x7b, 0x92, 0x37, 0xde, 0xf8, 0x11, 0xaf, 0x46, 0x60, 0x89, 0x2c, 0xc5, 0xe3, 0xa, 0x82, 0x6b, 0x4d, 0xa4, 0x1, 0xe8, 0xce, 0x27, 0x99, 0x70, 0x56, 0xbf, 0x1a, 0xf3, 0xd5, 0x3c, 0xad, 0x44, 0x62, 0x8b, 0x2e, 0xc7, 0xe1, 0x8, 0xb6, 0x5f, 0x79, 0x90, 0x35, 0xdc, 0xfa, 0x13, 0x9b, 0x72, 0x54, 0xbd, 0x18, 0xf1, 0xd7, 0x3e, 0x80, 0x69, 0x4f, 0xa6, 0x3, 0xea, 0xcc, 0x25, 0xc1, 0x28, 0xe, 0xe7, 0x42, 0xab, 0x8d, 0x64, 0xda, 0x33, 0x15, 0xfc, 0x59, 0xb0, 0x96, 0x7f, 0xf7, 0x1e, 0x38, 0xd1, 0x74, 0x9d, 0xbb, 0x52, 0xec, 0x5, 0x23, 0xca, 0x6f, 0x86, 0xa0, 0x49, 0x75, 0x9c, 0xba, 0x53, 0xf6, 0x1f, 0x39, 0xd0, 0x6e, 0x87, 0xa1, 0x48, 0xed, 0x4, 0x22, 0xcb, 0x43, 0xaa, 0x8c, 0x65, 0xc0, 0x29, 0xf, 0xe6, 0x58, 0xb1, 0x97, 0x7e, 0xdb, 0x32, 0x14, 0xfd, 0x19, 0xf0, 0xd6, 0x3f, 0x9a, 0x73, 0x55, 0xbc, 0x2, 0xeb, 0xcd, 0x24, 0x81, 0x68, 0x4e, 0xa7, 0x2f, 0xc6, 0xe0, 0x9, 0xac, 0x45, 0x63, 0x8a, 0x34, 0xdd, 0xfb, 0x12, 0xb7, 0x5e, 0x78, 0x91},
+ {0x0, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x3, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf, 0x6, 0xec, 0xcf, 0x25, 0x89, 0x63, 0x40, 0xaa, 0x5, 0xef, 0xcc, 0x26, 0x8a, 0x60, 0x43, 0xa9, 0xc, 0xe6, 0xc5, 0x2f, 0x83, 0x69, 0x4a, 0xa0, 0xf, 0xe5, 0xc6, 0x2c, 0x80, 0x6a, 0x49, 0xa3, 0xa, 0xe0, 0xc3, 0x29, 0x85, 0x6f, 0x4c, 0xa6, 0x9, 0xe3, 0xc0, 0x2a, 0x86, 0x6c, 0x4f, 0xa5, 0x18, 0xf2, 0xd1, 0x3b, 0x97, 0x7d, 0x5e, 0xb4, 0x1b, 0xf1, 0xd2, 0x38, 0x94, 0x7e, 0x5d, 0xb7, 0x1e, 0xf4, 0xd7, 0x3d, 0x91, 0x7b, 0x58, 0xb2, 0x1d, 0xf7, 0xd4, 0x3e, 0x92, 0x78, 0x5b, 0xb1, 0x14, 0xfe, 0xdd, 0x37, 0x9b, 0x71, 0x52, 0xb8, 0x17, 0xfd, 0xde, 0x34, 0x98, 0x72, 0x51, 0xbb, 0x12, 0xf8, 0xdb, 0x31, 0x9d, 0x77, 0x54, 0xbe, 0x11, 0xfb, 0xd8, 0x32, 0x9e, 0x74, 0x57, 0xbd, 0x30, 0xda, 0xf9, 0x13, 0xbf, 0x55, 0x76, 0x9c, 0x33, 0xd9, 0xfa, 0x10, 0xbc, 0x56, 0x75, 0x9f, 0x36, 0xdc, 0xff, 0x15, 0xb9, 0x53, 0x70, 0x9a, 0x35, 0xdf, 0xfc, 0x16, 0xba, 0x50, 0x73, 0x99, 0x3c, 0xd6, 0xf5, 0x1f, 0xb3, 0x59, 0x7a, 0x90, 0x3f, 0xd5, 0xf6, 0x1c, 0xb0, 0x5a, 0x79, 0x93, 0x3a, 0xd0, 0xf3, 0x19, 0xb5, 0x5f, 0x7c, 0x96, 0x39, 0xd3, 0xf0, 0x1a, 0xb6, 0x5c, 0x7f, 0x95, 0x28, 0xc2, 0xe1, 0xb, 0xa7, 0x4d, 0x6e, 0x84, 0x2b, 0xc1, 0xe2, 0x8, 0xa4, 0x4e, 0x6d, 0x87, 0x2e, 0xc4, 0xe7, 0xd, 0xa1, 0x4b, 0x68, 0x82, 0x2d, 0xc7, 0xe4, 0xe, 0xa2, 0x48, 0x6b, 0x81, 0x24, 0xce, 0xed, 0x7, 0xab, 0x41, 0x62, 0x88, 0x27, 0xcd, 0xee, 0x4, 0xa8, 0x42, 0x61, 0x8b, 0x22, 0xc8, 0xeb, 0x1, 0xad, 0x47, 0x64, 0x8e, 0x21, 0xcb, 0xe8, 0x2, 0xae, 0x44, 0x67, 0x8d},
+ {0x0, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0xb, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0, 0x16, 0xfd, 0xdd, 0x36, 0x9d, 0x76, 0x56, 0xbd, 0x1d, 0xf6, 0xd6, 0x3d, 0x96, 0x7d, 0x5d, 0xb6, 0x2c, 0xc7, 0xe7, 0xc, 0xa7, 0x4c, 0x6c, 0x87, 0x27, 0xcc, 0xec, 0x7, 0xac, 0x47, 0x67, 0x8c, 0x3a, 0xd1, 0xf1, 0x1a, 0xb1, 0x5a, 0x7a, 0x91, 0x31, 0xda, 0xfa, 0x11, 0xba, 0x51, 0x71, 0x9a, 0x58, 0xb3, 0x93, 0x78, 0xd3, 0x38, 0x18, 0xf3, 0x53, 0xb8, 0x98, 0x73, 0xd8, 0x33, 0x13, 0xf8, 0x4e, 0xa5, 0x85, 0x6e, 0xc5, 0x2e, 0xe, 0xe5, 0x45, 0xae, 0x8e, 0x65, 0xce, 0x25, 0x5, 0xee, 0x74, 0x9f, 0xbf, 0x54, 0xff, 0x14, 0x34, 0xdf, 0x7f, 0x94, 0xb4, 0x5f, 0xf4, 0x1f, 0x3f, 0xd4, 0x62, 0x89, 0xa9, 0x42, 0xe9, 0x2, 0x22, 0xc9, 0x69, 0x82, 0xa2, 0x49, 0xe2, 0x9, 0x29, 0xc2, 0xb0, 0x5b, 0x7b, 0x90, 0x3b, 0xd0, 0xf0, 0x1b, 0xbb, 0x50, 0x70, 0x9b, 0x30, 0xdb, 0xfb, 0x10, 0xa6, 0x4d, 0x6d, 0x86, 0x2d, 0xc6, 0xe6, 0xd, 0xad, 0x46, 0x66, 0x8d, 0x26, 0xcd, 0xed, 0x6, 0x9c, 0x77, 0x57, 0xbc, 0x17, 0xfc, 0xdc, 0x37, 0x97, 0x7c, 0x5c, 0xb7, 0x1c, 0xf7, 0xd7, 0x3c, 0x8a, 0x61, 0x41, 0xaa, 0x1, 0xea, 0xca, 0x21, 0x81, 0x6a, 0x4a, 0xa1, 0xa, 0xe1, 0xc1, 0x2a, 0xe8, 0x3, 0x23, 0xc8, 0x63, 0x88, 0xa8, 0x43, 0xe3, 0x8, 0x28, 0xc3, 0x68, 0x83, 0xa3, 0x48, 0xfe, 0x15, 0x35, 0xde, 0x75, 0x9e, 0xbe, 0x55, 0xf5, 0x1e, 0x3e, 0xd5, 0x7e, 0x95, 0xb5, 0x5e, 0xc4, 0x2f, 0xf, 0xe4, 0x4f, 0xa4, 0x84, 0x6f, 0xcf, 0x24, 0x4, 0xef, 0x44, 0xaf, 0x8f, 0x64, 0xd2, 0x39, 0x19, 0xf2, 0x59, 0xb2, 0x92, 0x79, 0xd9, 0x32, 0x12, 0xf9, 0x52, 0xb9, 0x99, 0x72},
+ {0x0, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d, 0x66, 0x8a, 0xa3, 0x4f, 0xf1, 0x1d, 0x34, 0xd8, 0x55, 0xb9, 0x90, 0x7c, 0xc2, 0x2e, 0x7, 0xeb, 0xcc, 0x20, 0x9, 0xe5, 0x5b, 0xb7, 0x9e, 0x72, 0xff, 0x13, 0x3a, 0xd6, 0x68, 0x84, 0xad, 0x41, 0xaa, 0x46, 0x6f, 0x83, 0x3d, 0xd1, 0xf8, 0x14, 0x99, 0x75, 0x5c, 0xb0, 0xe, 0xe2, 0xcb, 0x27, 0x85, 0x69, 0x40, 0xac, 0x12, 0xfe, 0xd7, 0x3b, 0xb6, 0x5a, 0x73, 0x9f, 0x21, 0xcd, 0xe4, 0x8, 0xe3, 0xf, 0x26, 0xca, 0x74, 0x98, 0xb1, 0x5d, 0xd0, 0x3c, 0x15, 0xf9, 0x47, 0xab, 0x82, 0x6e, 0x49, 0xa5, 0x8c, 0x60, 0xde, 0x32, 0x1b, 0xf7, 0x7a, 0x96, 0xbf, 0x53, 0xed, 0x1, 0x28, 0xc4, 0x2f, 0xc3, 0xea, 0x6, 0xb8, 0x54, 0x7d, 0x91, 0x1c, 0xf0, 0xd9, 0x35, 0x8b, 0x67, 0x4e, 0xa2, 0x17, 0xfb, 0xd2, 0x3e, 0x80, 0x6c, 0x45, 0xa9, 0x24, 0xc8, 0xe1, 0xd, 0xb3, 0x5f, 0x76, 0x9a, 0x71, 0x9d, 0xb4, 0x58, 0xe6, 0xa, 0x23, 0xcf, 0x42, 0xae, 0x87, 0x6b, 0xd5, 0x39, 0x10, 0xfc, 0xdb, 0x37, 0x1e, 0xf2, 0x4c, 0xa0, 0x89, 0x65, 0xe8, 0x4, 0x2d, 0xc1, 0x7f, 0x93, 0xba, 0x56, 0xbd, 0x51, 0x78, 0x94, 0x2a, 0xc6, 0xef, 0x3, 0x8e, 0x62, 0x4b, 0xa7, 0x19, 0xf5, 0xdc, 0x30, 0x92, 0x7e, 0x57, 0xbb, 0x5, 0xe9, 0xc0, 0x2c, 0xa1, 0x4d, 0x64, 0x88, 0x36, 0xda, 0xf3, 0x1f, 0xf4, 0x18, 0x31, 0xdd, 0x63, 0x8f, 0xa6, 0x4a, 0xc7, 0x2b, 0x2, 0xee, 0x50, 0xbc, 0x95, 0x79, 0x5e, 0xb2, 0x9b, 0x77, 0xc9, 0x25, 0xc, 0xe0, 0x6d, 0x81, 0xa8, 0x44, 0xfa, 0x16, 0x3f, 0xd3, 0x38, 0xd4, 0xfd, 0x11, 0xaf, 0x43, 0x6a, 0x86, 0xb, 0xe7, 0xce, 0x22, 0x9c, 0x70, 0x59, 0xb5},
+ {0x0, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82, 0x76, 0x9b, 0xb1, 0x5c, 0xe5, 0x8, 0x22, 0xcf, 0x4d, 0xa0, 0x8a, 0x67, 0xde, 0x33, 0x19, 0xf4, 0xec, 0x1, 0x2b, 0xc6, 0x7f, 0x92, 0xb8, 0x55, 0xd7, 0x3a, 0x10, 0xfd, 0x44, 0xa9, 0x83, 0x6e, 0x9a, 0x77, 0x5d, 0xb0, 0x9, 0xe4, 0xce, 0x23, 0xa1, 0x4c, 0x66, 0x8b, 0x32, 0xdf, 0xf5, 0x18, 0xc5, 0x28, 0x2, 0xef, 0x56, 0xbb, 0x91, 0x7c, 0xfe, 0x13, 0x39, 0xd4, 0x6d, 0x80, 0xaa, 0x47, 0xb3, 0x5e, 0x74, 0x99, 0x20, 0xcd, 0xe7, 0xa, 0x88, 0x65, 0x4f, 0xa2, 0x1b, 0xf6, 0xdc, 0x31, 0x29, 0xc4, 0xee, 0x3, 0xba, 0x57, 0x7d, 0x90, 0x12, 0xff, 0xd5, 0x38, 0x81, 0x6c, 0x46, 0xab, 0x5f, 0xb2, 0x98, 0x75, 0xcc, 0x21, 0xb, 0xe6, 0x64, 0x89, 0xa3, 0x4e, 0xf7, 0x1a, 0x30, 0xdd, 0x97, 0x7a, 0x50, 0xbd, 0x4, 0xe9, 0xc3, 0x2e, 0xac, 0x41, 0x6b, 0x86, 0x3f, 0xd2, 0xf8, 0x15, 0xe1, 0xc, 0x26, 0xcb, 0x72, 0x9f, 0xb5, 0x58, 0xda, 0x37, 0x1d, 0xf0, 0x49, 0xa4, 0x8e, 0x63, 0x7b, 0x96, 0xbc, 0x51, 0xe8, 0x5, 0x2f, 0xc2, 0x40, 0xad, 0x87, 0x6a, 0xd3, 0x3e, 0x14, 0xf9, 0xd, 0xe0, 0xca, 0x27, 0x9e, 0x73, 0x59, 0xb4, 0x36, 0xdb, 0xf1, 0x1c, 0xa5, 0x48, 0x62, 0x8f, 0x52, 0xbf, 0x95, 0x78, 0xc1, 0x2c, 0x6, 0xeb, 0x69, 0x84, 0xae, 0x43, 0xfa, 0x17, 0x3d, 0xd0, 0x24, 0xc9, 0xe3, 0xe, 0xb7, 0x5a, 0x70, 0x9d, 0x1f, 0xf2, 0xd8, 0x35, 0x8c, 0x61, 0x4b, 0xa6, 0xbe, 0x53, 0x79, 0x94, 0x2d, 0xc0, 0xea, 0x7, 0x85, 0x68, 0x42, 0xaf, 0x16, 0xfb, 0xd1, 0x3c, 0xc8, 0x25, 0xf, 0xe2, 0x5b, 0xb6, 0x9c, 0x71, 0xf3, 0x1e, 0x34, 0xd9, 0x60, 0x8d, 0xa7, 0x4a},
+ {0x0, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0xc, 0xbc, 0x52, 0x7d, 0x93, 0x46, 0xa8, 0x87, 0x69, 0xd9, 0x37, 0x18, 0xf6, 0x65, 0x8b, 0xa4, 0x4a, 0xfa, 0x14, 0x3b, 0xd5, 0x8c, 0x62, 0x4d, 0xa3, 0x13, 0xfd, 0xd2, 0x3c, 0xaf, 0x41, 0x6e, 0x80, 0x30, 0xde, 0xf1, 0x1f, 0xca, 0x24, 0xb, 0xe5, 0x55, 0xbb, 0x94, 0x7a, 0xe9, 0x7, 0x28, 0xc6, 0x76, 0x98, 0xb7, 0x59, 0x5, 0xeb, 0xc4, 0x2a, 0x9a, 0x74, 0x5b, 0xb5, 0x26, 0xc8, 0xe7, 0x9, 0xb9, 0x57, 0x78, 0x96, 0x43, 0xad, 0x82, 0x6c, 0xdc, 0x32, 0x1d, 0xf3, 0x60, 0x8e, 0xa1, 0x4f, 0xff, 0x11, 0x3e, 0xd0, 0x89, 0x67, 0x48, 0xa6, 0x16, 0xf8, 0xd7, 0x39, 0xaa, 0x44, 0x6b, 0x85, 0x35, 0xdb, 0xf4, 0x1a, 0xcf, 0x21, 0xe, 0xe0, 0x50, 0xbe, 0x91, 0x7f, 0xec, 0x2, 0x2d, 0xc3, 0x73, 0x9d, 0xb2, 0x5c, 0xa, 0xe4, 0xcb, 0x25, 0x95, 0x7b, 0x54, 0xba, 0x29, 0xc7, 0xe8, 0x6, 0xb6, 0x58, 0x77, 0x99, 0x4c, 0xa2, 0x8d, 0x63, 0xd3, 0x3d, 0x12, 0xfc, 0x6f, 0x81, 0xae, 0x40, 0xf0, 0x1e, 0x31, 0xdf, 0x86, 0x68, 0x47, 0xa9, 0x19, 0xf7, 0xd8, 0x36, 0xa5, 0x4b, 0x64, 0x8a, 0x3a, 0xd4, 0xfb, 0x15, 0xc0, 0x2e, 0x1, 0xef, 0x5f, 0xb1, 0x9e, 0x70, 0xe3, 0xd, 0x22, 0xcc, 0x7c, 0x92, 0xbd, 0x53, 0xf, 0xe1, 0xce, 0x20, 0x90, 0x7e, 0x51, 0xbf, 0x2c, 0xc2, 0xed, 0x3, 0xb3, 0x5d, 0x72, 0x9c, 0x49, 0xa7, 0x88, 0x66, 0xd6, 0x38, 0x17, 0xf9, 0x6a, 0x84, 0xab, 0x45, 0xf5, 0x1b, 0x34, 0xda, 0x83, 0x6d, 0x42, 0xac, 0x1c, 0xf2, 0xdd, 0x33, 0xa0, 0x4e, 0x61, 0x8f, 0x3f, 0xd1, 0xfe, 0x10, 0xc5, 0x2b, 0x4, 0xea, 0x5a, 0xb4, 0x9b, 0x75, 0xe6, 0x8, 0x27, 0xc9, 0x79, 0x97, 0xb8, 0x56},
+ {0x0, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x7, 0xb0, 0x5f, 0x73, 0x9c, 0x56, 0xb9, 0x95, 0x7a, 0xcd, 0x22, 0xe, 0xe1, 0x7d, 0x92, 0xbe, 0x51, 0xe6, 0x9, 0x25, 0xca, 0xac, 0x43, 0x6f, 0x80, 0x37, 0xd8, 0xf4, 0x1b, 0x87, 0x68, 0x44, 0xab, 0x1c, 0xf3, 0xdf, 0x30, 0xfa, 0x15, 0x39, 0xd6, 0x61, 0x8e, 0xa2, 0x4d, 0xd1, 0x3e, 0x12, 0xfd, 0x4a, 0xa5, 0x89, 0x66, 0x45, 0xaa, 0x86, 0x69, 0xde, 0x31, 0x1d, 0xf2, 0x6e, 0x81, 0xad, 0x42, 0xf5, 0x1a, 0x36, 0xd9, 0x13, 0xfc, 0xd0, 0x3f, 0x88, 0x67, 0x4b, 0xa4, 0x38, 0xd7, 0xfb, 0x14, 0xa3, 0x4c, 0x60, 0x8f, 0xe9, 0x6, 0x2a, 0xc5, 0x72, 0x9d, 0xb1, 0x5e, 0xc2, 0x2d, 0x1, 0xee, 0x59, 0xb6, 0x9a, 0x75, 0xbf, 0x50, 0x7c, 0x93, 0x24, 0xcb, 0xe7, 0x8, 0x94, 0x7b, 0x57, 0xb8, 0xf, 0xe0, 0xcc, 0x23, 0x8a, 0x65, 0x49, 0xa6, 0x11, 0xfe, 0xd2, 0x3d, 0xa1, 0x4e, 0x62, 0x8d, 0x3a, 0xd5, 0xf9, 0x16, 0xdc, 0x33, 0x1f, 0xf0, 0x47, 0xa8, 0x84, 0x6b, 0xf7, 0x18, 0x34, 0xdb, 0x6c, 0x83, 0xaf, 0x40, 0x26, 0xc9, 0xe5, 0xa, 0xbd, 0x52, 0x7e, 0x91, 0xd, 0xe2, 0xce, 0x21, 0x96, 0x79, 0x55, 0xba, 0x70, 0x9f, 0xb3, 0x5c, 0xeb, 0x4, 0x28, 0xc7, 0x5b, 0xb4, 0x98, 0x77, 0xc0, 0x2f, 0x3, 0xec, 0xcf, 0x20, 0xc, 0xe3, 0x54, 0xbb, 0x97, 0x78, 0xe4, 0xb, 0x27, 0xc8, 0x7f, 0x90, 0xbc, 0x53, 0x99, 0x76, 0x5a, 0xb5, 0x2, 0xed, 0xc1, 0x2e, 0xb2, 0x5d, 0x71, 0x9e, 0x29, 0xc6, 0xea, 0x5, 0x63, 0x8c, 0xa0, 0x4f, 0xf8, 0x17, 0x3b, 0xd4, 0x48, 0xa7, 0x8b, 0x64, 0xd3, 0x3c, 0x10, 0xff, 0x35, 0xda, 0xf6, 0x19, 0xae, 0x41, 0x6d, 0x82, 0x1e, 0xf1, 0xdd, 0x32, 0x85, 0x6a, 0x46, 0xa9},
+ {0x0, 0xf0, 0xfd, 0xd, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39, 0xbb, 0x4b, 0x46, 0xb6, 0x5c, 0xac, 0xa1, 0x51, 0x68, 0x98, 0x95, 0x65, 0x8f, 0x7f, 0x72, 0x82, 0x6b, 0x9b, 0x96, 0x66, 0x8c, 0x7c, 0x71, 0x81, 0xb8, 0x48, 0x45, 0xb5, 0x5f, 0xaf, 0xa2, 0x52, 0xd0, 0x20, 0x2d, 0xdd, 0x37, 0xc7, 0xca, 0x3a, 0x3, 0xf3, 0xfe, 0xe, 0xe4, 0x14, 0x19, 0xe9, 0xd6, 0x26, 0x2b, 0xdb, 0x31, 0xc1, 0xcc, 0x3c, 0x5, 0xf5, 0xf8, 0x8, 0xe2, 0x12, 0x1f, 0xef, 0x6d, 0x9d, 0x90, 0x60, 0x8a, 0x7a, 0x77, 0x87, 0xbe, 0x4e, 0x43, 0xb3, 0x59, 0xa9, 0xa4, 0x54, 0xbd, 0x4d, 0x40, 0xb0, 0x5a, 0xaa, 0xa7, 0x57, 0x6e, 0x9e, 0x93, 0x63, 0x89, 0x79, 0x74, 0x84, 0x6, 0xf6, 0xfb, 0xb, 0xe1, 0x11, 0x1c, 0xec, 0xd5, 0x25, 0x28, 0xd8, 0x32, 0xc2, 0xcf, 0x3f, 0xb1, 0x41, 0x4c, 0xbc, 0x56, 0xa6, 0xab, 0x5b, 0x62, 0x92, 0x9f, 0x6f, 0x85, 0x75, 0x78, 0x88, 0xa, 0xfa, 0xf7, 0x7, 0xed, 0x1d, 0x10, 0xe0, 0xd9, 0x29, 0x24, 0xd4, 0x3e, 0xce, 0xc3, 0x33, 0xda, 0x2a, 0x27, 0xd7, 0x3d, 0xcd, 0xc0, 0x30, 0x9, 0xf9, 0xf4, 0x4, 0xee, 0x1e, 0x13, 0xe3, 0x61, 0x91, 0x9c, 0x6c, 0x86, 0x76, 0x7b, 0x8b, 0xb2, 0x42, 0x4f, 0xbf, 0x55, 0xa5, 0xa8, 0x58, 0x67, 0x97, 0x9a, 0x6a, 0x80, 0x70, 0x7d, 0x8d, 0xb4, 0x44, 0x49, 0xb9, 0x53, 0xa3, 0xae, 0x5e, 0xdc, 0x2c, 0x21, 0xd1, 0x3b, 0xcb, 0xc6, 0x36, 0xf, 0xff, 0xf2, 0x2, 0xe8, 0x18, 0x15, 0xe5, 0xc, 0xfc, 0xf1, 0x1, 0xeb, 0x1b, 0x16, 0xe6, 0xdf, 0x2f, 0x22, 0xd2, 0x38, 0xc8, 0xc5, 0x35, 0xb7, 0x47, 0x4a, 0xba, 0x50, 0xa0, 0xad, 0x5d, 0x64, 0x94, 0x99, 0x69, 0x83, 0x73, 0x7e, 0x8e},
+ {0x0, 0xf1, 0xff, 0xe, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36, 0xab, 0x5a, 0x54, 0xa5, 0x48, 0xb9, 0xb7, 0x46, 0x70, 0x81, 0x8f, 0x7e, 0x93, 0x62, 0x6c, 0x9d, 0x4b, 0xba, 0xb4, 0x45, 0xa8, 0x59, 0x57, 0xa6, 0x90, 0x61, 0x6f, 0x9e, 0x73, 0x82, 0x8c, 0x7d, 0xe0, 0x11, 0x1f, 0xee, 0x3, 0xf2, 0xfc, 0xd, 0x3b, 0xca, 0xc4, 0x35, 0xd8, 0x29, 0x27, 0xd6, 0x96, 0x67, 0x69, 0x98, 0x75, 0x84, 0x8a, 0x7b, 0x4d, 0xbc, 0xb2, 0x43, 0xae, 0x5f, 0x51, 0xa0, 0x3d, 0xcc, 0xc2, 0x33, 0xde, 0x2f, 0x21, 0xd0, 0xe6, 0x17, 0x19, 0xe8, 0x5, 0xf4, 0xfa, 0xb, 0xdd, 0x2c, 0x22, 0xd3, 0x3e, 0xcf, 0xc1, 0x30, 0x6, 0xf7, 0xf9, 0x8, 0xe5, 0x14, 0x1a, 0xeb, 0x76, 0x87, 0x89, 0x78, 0x95, 0x64, 0x6a, 0x9b, 0xad, 0x5c, 0x52, 0xa3, 0x4e, 0xbf, 0xb1, 0x40, 0x31, 0xc0, 0xce, 0x3f, 0xd2, 0x23, 0x2d, 0xdc, 0xea, 0x1b, 0x15, 0xe4, 0x9, 0xf8, 0xf6, 0x7, 0x9a, 0x6b, 0x65, 0x94, 0x79, 0x88, 0x86, 0x77, 0x41, 0xb0, 0xbe, 0x4f, 0xa2, 0x53, 0x5d, 0xac, 0x7a, 0x8b, 0x85, 0x74, 0x99, 0x68, 0x66, 0x97, 0xa1, 0x50, 0x5e, 0xaf, 0x42, 0xb3, 0xbd, 0x4c, 0xd1, 0x20, 0x2e, 0xdf, 0x32, 0xc3, 0xcd, 0x3c, 0xa, 0xfb, 0xf5, 0x4, 0xe9, 0x18, 0x16, 0xe7, 0xa7, 0x56, 0x58, 0xa9, 0x44, 0xb5, 0xbb, 0x4a, 0x7c, 0x8d, 0x83, 0x72, 0x9f, 0x6e, 0x60, 0x91, 0xc, 0xfd, 0xf3, 0x2, 0xef, 0x1e, 0x10, 0xe1, 0xd7, 0x26, 0x28, 0xd9, 0x34, 0xc5, 0xcb, 0x3a, 0xec, 0x1d, 0x13, 0xe2, 0xf, 0xfe, 0xf0, 0x1, 0x37, 0xc6, 0xc8, 0x39, 0xd4, 0x25, 0x2b, 0xda, 0x47, 0xb6, 0xb8, 0x49, 0xa4, 0x55, 0x5b, 0xaa, 0x9c, 0x6d, 0x63, 0x92, 0x7f, 0x8e, 0x80, 0x71},
+ {0x0, 0xf2, 0xf9, 0xb, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27, 0x9b, 0x69, 0x62, 0x90, 0x74, 0x86, 0x8d, 0x7f, 0x58, 0xaa, 0xa1, 0x53, 0xb7, 0x45, 0x4e, 0xbc, 0x2b, 0xd9, 0xd2, 0x20, 0xc4, 0x36, 0x3d, 0xcf, 0xe8, 0x1a, 0x11, 0xe3, 0x7, 0xf5, 0xfe, 0xc, 0xb0, 0x42, 0x49, 0xbb, 0x5f, 0xad, 0xa6, 0x54, 0x73, 0x81, 0x8a, 0x78, 0x9c, 0x6e, 0x65, 0x97, 0x56, 0xa4, 0xaf, 0x5d, 0xb9, 0x4b, 0x40, 0xb2, 0x95, 0x67, 0x6c, 0x9e, 0x7a, 0x88, 0x83, 0x71, 0xcd, 0x3f, 0x34, 0xc6, 0x22, 0xd0, 0xdb, 0x29, 0xe, 0xfc, 0xf7, 0x5, 0xe1, 0x13, 0x18, 0xea, 0x7d, 0x8f, 0x84, 0x76, 0x92, 0x60, 0x6b, 0x99, 0xbe, 0x4c, 0x47, 0xb5, 0x51, 0xa3, 0xa8, 0x5a, 0xe6, 0x14, 0x1f, 0xed, 0x9, 0xfb, 0xf0, 0x2, 0x25, 0xd7, 0xdc, 0x2e, 0xca, 0x38, 0x33, 0xc1, 0xac, 0x5e, 0x55, 0xa7, 0x43, 0xb1, 0xba, 0x48, 0x6f, 0x9d, 0x96, 0x64, 0x80, 0x72, 0x79, 0x8b, 0x37, 0xc5, 0xce, 0x3c, 0xd8, 0x2a, 0x21, 0xd3, 0xf4, 0x6, 0xd, 0xff, 0x1b, 0xe9, 0xe2, 0x10, 0x87, 0x75, 0x7e, 0x8c, 0x68, 0x9a, 0x91, 0x63, 0x44, 0xb6, 0xbd, 0x4f, 0xab, 0x59, 0x52, 0xa0, 0x1c, 0xee, 0xe5, 0x17, 0xf3, 0x1, 0xa, 0xf8, 0xdf, 0x2d, 0x26, 0xd4, 0x30, 0xc2, 0xc9, 0x3b, 0xfa, 0x8, 0x3, 0xf1, 0x15, 0xe7, 0xec, 0x1e, 0x39, 0xcb, 0xc0, 0x32, 0xd6, 0x24, 0x2f, 0xdd, 0x61, 0x93, 0x98, 0x6a, 0x8e, 0x7c, 0x77, 0x85, 0xa2, 0x50, 0x5b, 0xa9, 0x4d, 0xbf, 0xb4, 0x46, 0xd1, 0x23, 0x28, 0xda, 0x3e, 0xcc, 0xc7, 0x35, 0x12, 0xe0, 0xeb, 0x19, 0xfd, 0xf, 0x4, 0xf6, 0x4a, 0xb8, 0xb3, 0x41, 0xa5, 0x57, 0x5c, 0xae, 0x89, 0x7b, 0x70, 0x82, 0x66, 0x94, 0x9f, 0x6d},
+ {0x0, 0xf3, 0xfb, 0x8, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28, 0x8b, 0x78, 0x70, 0x83, 0x60, 0x93, 0x9b, 0x68, 0x40, 0xb3, 0xbb, 0x48, 0xab, 0x58, 0x50, 0xa3, 0xb, 0xf8, 0xf0, 0x3, 0xe0, 0x13, 0x1b, 0xe8, 0xc0, 0x33, 0x3b, 0xc8, 0x2b, 0xd8, 0xd0, 0x23, 0x80, 0x73, 0x7b, 0x88, 0x6b, 0x98, 0x90, 0x63, 0x4b, 0xb8, 0xb0, 0x43, 0xa0, 0x53, 0x5b, 0xa8, 0x16, 0xe5, 0xed, 0x1e, 0xfd, 0xe, 0x6, 0xf5, 0xdd, 0x2e, 0x26, 0xd5, 0x36, 0xc5, 0xcd, 0x3e, 0x9d, 0x6e, 0x66, 0x95, 0x76, 0x85, 0x8d, 0x7e, 0x56, 0xa5, 0xad, 0x5e, 0xbd, 0x4e, 0x46, 0xb5, 0x1d, 0xee, 0xe6, 0x15, 0xf6, 0x5, 0xd, 0xfe, 0xd6, 0x25, 0x2d, 0xde, 0x3d, 0xce, 0xc6, 0x35, 0x96, 0x65, 0x6d, 0x9e, 0x7d, 0x8e, 0x86, 0x75, 0x5d, 0xae, 0xa6, 0x55, 0xb6, 0x45, 0x4d, 0xbe, 0x2c, 0xdf, 0xd7, 0x24, 0xc7, 0x34, 0x3c, 0xcf, 0xe7, 0x14, 0x1c, 0xef, 0xc, 0xff, 0xf7, 0x4, 0xa7, 0x54, 0x5c, 0xaf, 0x4c, 0xbf, 0xb7, 0x44, 0x6c, 0x9f, 0x97, 0x64, 0x87, 0x74, 0x7c, 0x8f, 0x27, 0xd4, 0xdc, 0x2f, 0xcc, 0x3f, 0x37, 0xc4, 0xec, 0x1f, 0x17, 0xe4, 0x7, 0xf4, 0xfc, 0xf, 0xac, 0x5f, 0x57, 0xa4, 0x47, 0xb4, 0xbc, 0x4f, 0x67, 0x94, 0x9c, 0x6f, 0x8c, 0x7f, 0x77, 0x84, 0x3a, 0xc9, 0xc1, 0x32, 0xd1, 0x22, 0x2a, 0xd9, 0xf1, 0x2, 0xa, 0xf9, 0x1a, 0xe9, 0xe1, 0x12, 0xb1, 0x42, 0x4a, 0xb9, 0x5a, 0xa9, 0xa1, 0x52, 0x7a, 0x89, 0x81, 0x72, 0x91, 0x62, 0x6a, 0x99, 0x31, 0xc2, 0xca, 0x39, 0xda, 0x29, 0x21, 0xd2, 0xfa, 0x9, 0x1, 0xf2, 0x11, 0xe2, 0xea, 0x19, 0xba, 0x49, 0x41, 0xb2, 0x51, 0xa2, 0xaa, 0x59, 0x71, 0x82, 0x8a, 0x79, 0x9a, 0x69, 0x61, 0x92},
+ {0x0, 0xf4, 0xf5, 0x1, 0xf7, 0x3, 0x2, 0xf6, 0xf3, 0x7, 0x6, 0xf2, 0x4, 0xf0, 0xf1, 0x5, 0xfb, 0xf, 0xe, 0xfa, 0xc, 0xf8, 0xf9, 0xd, 0x8, 0xfc, 0xfd, 0x9, 0xff, 0xb, 0xa, 0xfe, 0xeb, 0x1f, 0x1e, 0xea, 0x1c, 0xe8, 0xe9, 0x1d, 0x18, 0xec, 0xed, 0x19, 0xef, 0x1b, 0x1a, 0xee, 0x10, 0xe4, 0xe5, 0x11, 0xe7, 0x13, 0x12, 0xe6, 0xe3, 0x17, 0x16, 0xe2, 0x14, 0xe0, 0xe1, 0x15, 0xcb, 0x3f, 0x3e, 0xca, 0x3c, 0xc8, 0xc9, 0x3d, 0x38, 0xcc, 0xcd, 0x39, 0xcf, 0x3b, 0x3a, 0xce, 0x30, 0xc4, 0xc5, 0x31, 0xc7, 0x33, 0x32, 0xc6, 0xc3, 0x37, 0x36, 0xc2, 0x34, 0xc0, 0xc1, 0x35, 0x20, 0xd4, 0xd5, 0x21, 0xd7, 0x23, 0x22, 0xd6, 0xd3, 0x27, 0x26, 0xd2, 0x24, 0xd0, 0xd1, 0x25, 0xdb, 0x2f, 0x2e, 0xda, 0x2c, 0xd8, 0xd9, 0x2d, 0x28, 0xdc, 0xdd, 0x29, 0xdf, 0x2b, 0x2a, 0xde, 0x8b, 0x7f, 0x7e, 0x8a, 0x7c, 0x88, 0x89, 0x7d, 0x78, 0x8c, 0x8d, 0x79, 0x8f, 0x7b, 0x7a, 0x8e, 0x70, 0x84, 0x85, 0x71, 0x87, 0x73, 0x72, 0x86, 0x83, 0x77, 0x76, 0x82, 0x74, 0x80, 0x81, 0x75, 0x60, 0x94, 0x95, 0x61, 0x97, 0x63, 0x62, 0x96, 0x93, 0x67, 0x66, 0x92, 0x64, 0x90, 0x91, 0x65, 0x9b, 0x6f, 0x6e, 0x9a, 0x6c, 0x98, 0x99, 0x6d, 0x68, 0x9c, 0x9d, 0x69, 0x9f, 0x6b, 0x6a, 0x9e, 0x40, 0xb4, 0xb5, 0x41, 0xb7, 0x43, 0x42, 0xb6, 0xb3, 0x47, 0x46, 0xb2, 0x44, 0xb0, 0xb1, 0x45, 0xbb, 0x4f, 0x4e, 0xba, 0x4c, 0xb8, 0xb9, 0x4d, 0x48, 0xbc, 0xbd, 0x49, 0xbf, 0x4b, 0x4a, 0xbe, 0xab, 0x5f, 0x5e, 0xaa, 0x5c, 0xa8, 0xa9, 0x5d, 0x58, 0xac, 0xad, 0x59, 0xaf, 0x5b, 0x5a, 0xae, 0x50, 0xa4, 0xa5, 0x51, 0xa7, 0x53, 0x52, 0xa6, 0xa3, 0x57, 0x56, 0xa2, 0x54, 0xa0, 0xa1, 0x55},
+ {0x0, 0xf5, 0xf7, 0x2, 0xf3, 0x6, 0x4, 0xf1, 0xfb, 0xe, 0xc, 0xf9, 0x8, 0xfd, 0xff, 0xa, 0xeb, 0x1e, 0x1c, 0xe9, 0x18, 0xed, 0xef, 0x1a, 0x10, 0xe5, 0xe7, 0x12, 0xe3, 0x16, 0x14, 0xe1, 0xcb, 0x3e, 0x3c, 0xc9, 0x38, 0xcd, 0xcf, 0x3a, 0x30, 0xc5, 0xc7, 0x32, 0xc3, 0x36, 0x34, 0xc1, 0x20, 0xd5, 0xd7, 0x22, 0xd3, 0x26, 0x24, 0xd1, 0xdb, 0x2e, 0x2c, 0xd9, 0x28, 0xdd, 0xdf, 0x2a, 0x8b, 0x7e, 0x7c, 0x89, 0x78, 0x8d, 0x8f, 0x7a, 0x70, 0x85, 0x87, 0x72, 0x83, 0x76, 0x74, 0x81, 0x60, 0x95, 0x97, 0x62, 0x93, 0x66, 0x64, 0x91, 0x9b, 0x6e, 0x6c, 0x99, 0x68, 0x9d, 0x9f, 0x6a, 0x40, 0xb5, 0xb7, 0x42, 0xb3, 0x46, 0x44, 0xb1, 0xbb, 0x4e, 0x4c, 0xb9, 0x48, 0xbd, 0xbf, 0x4a, 0xab, 0x5e, 0x5c, 0xa9, 0x58, 0xad, 0xaf, 0x5a, 0x50, 0xa5, 0xa7, 0x52, 0xa3, 0x56, 0x54, 0xa1, 0xb, 0xfe, 0xfc, 0x9, 0xf8, 0xd, 0xf, 0xfa, 0xf0, 0x5, 0x7, 0xf2, 0x3, 0xf6, 0xf4, 0x1, 0xe0, 0x15, 0x17, 0xe2, 0x13, 0xe6, 0xe4, 0x11, 0x1b, 0xee, 0xec, 0x19, 0xe8, 0x1d, 0x1f, 0xea, 0xc0, 0x35, 0x37, 0xc2, 0x33, 0xc6, 0xc4, 0x31, 0x3b, 0xce, 0xcc, 0x39, 0xc8, 0x3d, 0x3f, 0xca, 0x2b, 0xde, 0xdc, 0x29, 0xd8, 0x2d, 0x2f, 0xda, 0xd0, 0x25, 0x27, 0xd2, 0x23, 0xd6, 0xd4, 0x21, 0x80, 0x75, 0x77, 0x82, 0x73, 0x86, 0x84, 0x71, 0x7b, 0x8e, 0x8c, 0x79, 0x88, 0x7d, 0x7f, 0x8a, 0x6b, 0x9e, 0x9c, 0x69, 0x98, 0x6d, 0x6f, 0x9a, 0x90, 0x65, 0x67, 0x92, 0x63, 0x96, 0x94, 0x61, 0x4b, 0xbe, 0xbc, 0x49, 0xb8, 0x4d, 0x4f, 0xba, 0xb0, 0x45, 0x47, 0xb2, 0x43, 0xb6, 0xb4, 0x41, 0xa0, 0x55, 0x57, 0xa2, 0x53, 0xa6, 0xa4, 0x51, 0x5b, 0xae, 0xac, 0x59, 0xa8, 0x5d, 0x5f, 0xaa},
+ {0x0, 0xf6, 0xf1, 0x7, 0xff, 0x9, 0xe, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b, 0xdb, 0x2d, 0x2a, 0xdc, 0x24, 0xd2, 0xd5, 0x23, 0x38, 0xce, 0xc9, 0x3f, 0xc7, 0x31, 0x36, 0xc0, 0xab, 0x5d, 0x5a, 0xac, 0x54, 0xa2, 0xa5, 0x53, 0x48, 0xbe, 0xb9, 0x4f, 0xb7, 0x41, 0x46, 0xb0, 0x70, 0x86, 0x81, 0x77, 0x8f, 0x79, 0x7e, 0x88, 0x93, 0x65, 0x62, 0x94, 0x6c, 0x9a, 0x9d, 0x6b, 0x4b, 0xbd, 0xba, 0x4c, 0xb4, 0x42, 0x45, 0xb3, 0xa8, 0x5e, 0x59, 0xaf, 0x57, 0xa1, 0xa6, 0x50, 0x90, 0x66, 0x61, 0x97, 0x6f, 0x99, 0x9e, 0x68, 0x73, 0x85, 0x82, 0x74, 0x8c, 0x7a, 0x7d, 0x8b, 0xe0, 0x16, 0x11, 0xe7, 0x1f, 0xe9, 0xee, 0x18, 0x3, 0xf5, 0xf2, 0x4, 0xfc, 0xa, 0xd, 0xfb, 0x3b, 0xcd, 0xca, 0x3c, 0xc4, 0x32, 0x35, 0xc3, 0xd8, 0x2e, 0x29, 0xdf, 0x27, 0xd1, 0xd6, 0x20, 0x96, 0x60, 0x67, 0x91, 0x69, 0x9f, 0x98, 0x6e, 0x75, 0x83, 0x84, 0x72, 0x8a, 0x7c, 0x7b, 0x8d, 0x4d, 0xbb, 0xbc, 0x4a, 0xb2, 0x44, 0x43, 0xb5, 0xae, 0x58, 0x5f, 0xa9, 0x51, 0xa7, 0xa0, 0x56, 0x3d, 0xcb, 0xcc, 0x3a, 0xc2, 0x34, 0x33, 0xc5, 0xde, 0x28, 0x2f, 0xd9, 0x21, 0xd7, 0xd0, 0x26, 0xe6, 0x10, 0x17, 0xe1, 0x19, 0xef, 0xe8, 0x1e, 0x5, 0xf3, 0xf4, 0x2, 0xfa, 0xc, 0xb, 0xfd, 0xdd, 0x2b, 0x2c, 0xda, 0x22, 0xd4, 0xd3, 0x25, 0x3e, 0xc8, 0xcf, 0x39, 0xc1, 0x37, 0x30, 0xc6, 0x6, 0xf0, 0xf7, 0x1, 0xf9, 0xf, 0x8, 0xfe, 0xe5, 0x13, 0x14, 0xe2, 0x1a, 0xec, 0xeb, 0x1d, 0x76, 0x80, 0x87, 0x71, 0x89, 0x7f, 0x78, 0x8e, 0x95, 0x63, 0x64, 0x92, 0x6a, 0x9c, 0x9b, 0x6d, 0xad, 0x5b, 0x5c, 0xaa, 0x52, 0xa4, 0xa3, 0x55, 0x4e, 0xb8, 0xbf, 0x49, 0xb1, 0x47, 0x40, 0xb6},
+ {0x0, 0xf7, 0xf3, 0x4, 0xfb, 0xc, 0x8, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14, 0xcb, 0x3c, 0x38, 0xcf, 0x30, 0xc7, 0xc3, 0x34, 0x20, 0xd7, 0xd3, 0x24, 0xdb, 0x2c, 0x28, 0xdf, 0x8b, 0x7c, 0x78, 0x8f, 0x70, 0x87, 0x83, 0x74, 0x60, 0x97, 0x93, 0x64, 0x9b, 0x6c, 0x68, 0x9f, 0x40, 0xb7, 0xb3, 0x44, 0xbb, 0x4c, 0x48, 0xbf, 0xab, 0x5c, 0x58, 0xaf, 0x50, 0xa7, 0xa3, 0x54, 0xb, 0xfc, 0xf8, 0xf, 0xf0, 0x7, 0x3, 0xf4, 0xe0, 0x17, 0x13, 0xe4, 0x1b, 0xec, 0xe8, 0x1f, 0xc0, 0x37, 0x33, 0xc4, 0x3b, 0xcc, 0xc8, 0x3f, 0x2b, 0xdc, 0xd8, 0x2f, 0xd0, 0x27, 0x23, 0xd4, 0x80, 0x77, 0x73, 0x84, 0x7b, 0x8c, 0x88, 0x7f, 0x6b, 0x9c, 0x98, 0x6f, 0x90, 0x67, 0x63, 0x94, 0x4b, 0xbc, 0xb8, 0x4f, 0xb0, 0x47, 0x43, 0xb4, 0xa0, 0x57, 0x53, 0xa4, 0x5b, 0xac, 0xa8, 0x5f, 0x16, 0xe1, 0xe5, 0x12, 0xed, 0x1a, 0x1e, 0xe9, 0xfd, 0xa, 0xe, 0xf9, 0x6, 0xf1, 0xf5, 0x2, 0xdd, 0x2a, 0x2e, 0xd9, 0x26, 0xd1, 0xd5, 0x22, 0x36, 0xc1, 0xc5, 0x32, 0xcd, 0x3a, 0x3e, 0xc9, 0x9d, 0x6a, 0x6e, 0x99, 0x66, 0x91, 0x95, 0x62, 0x76, 0x81, 0x85, 0x72, 0x8d, 0x7a, 0x7e, 0x89, 0x56, 0xa1, 0xa5, 0x52, 0xad, 0x5a, 0x5e, 0xa9, 0xbd, 0x4a, 0x4e, 0xb9, 0x46, 0xb1, 0xb5, 0x42, 0x1d, 0xea, 0xee, 0x19, 0xe6, 0x11, 0x15, 0xe2, 0xf6, 0x1, 0x5, 0xf2, 0xd, 0xfa, 0xfe, 0x9, 0xd6, 0x21, 0x25, 0xd2, 0x2d, 0xda, 0xde, 0x29, 0x3d, 0xca, 0xce, 0x39, 0xc6, 0x31, 0x35, 0xc2, 0x96, 0x61, 0x65, 0x92, 0x6d, 0x9a, 0x9e, 0x69, 0x7d, 0x8a, 0x8e, 0x79, 0x86, 0x71, 0x75, 0x82, 0x5d, 0xaa, 0xae, 0x59, 0xa6, 0x51, 0x55, 0xa2, 0xb6, 0x41, 0x45, 0xb2, 0x4d, 0xba, 0xbe, 0x49},
+ {0x0, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41, 0x3b, 0xc3, 0xd6, 0x2e, 0xfc, 0x4, 0x11, 0xe9, 0xa8, 0x50, 0x45, 0xbd, 0x6f, 0x97, 0x82, 0x7a, 0x76, 0x8e, 0x9b, 0x63, 0xb1, 0x49, 0x5c, 0xa4, 0xe5, 0x1d, 0x8, 0xf0, 0x22, 0xda, 0xcf, 0x37, 0x4d, 0xb5, 0xa0, 0x58, 0x8a, 0x72, 0x67, 0x9f, 0xde, 0x26, 0x33, 0xcb, 0x19, 0xe1, 0xf4, 0xc, 0xec, 0x14, 0x1, 0xf9, 0x2b, 0xd3, 0xc6, 0x3e, 0x7f, 0x87, 0x92, 0x6a, 0xb8, 0x40, 0x55, 0xad, 0xd7, 0x2f, 0x3a, 0xc2, 0x10, 0xe8, 0xfd, 0x5, 0x44, 0xbc, 0xa9, 0x51, 0x83, 0x7b, 0x6e, 0x96, 0x9a, 0x62, 0x77, 0x8f, 0x5d, 0xa5, 0xb0, 0x48, 0x9, 0xf1, 0xe4, 0x1c, 0xce, 0x36, 0x23, 0xdb, 0xa1, 0x59, 0x4c, 0xb4, 0x66, 0x9e, 0x8b, 0x73, 0x32, 0xca, 0xdf, 0x27, 0xf5, 0xd, 0x18, 0xe0, 0xc5, 0x3d, 0x28, 0xd0, 0x2, 0xfa, 0xef, 0x17, 0x56, 0xae, 0xbb, 0x43, 0x91, 0x69, 0x7c, 0x84, 0xfe, 0x6, 0x13, 0xeb, 0x39, 0xc1, 0xd4, 0x2c, 0x6d, 0x95, 0x80, 0x78, 0xaa, 0x52, 0x47, 0xbf, 0xb3, 0x4b, 0x5e, 0xa6, 0x74, 0x8c, 0x99, 0x61, 0x20, 0xd8, 0xcd, 0x35, 0xe7, 0x1f, 0xa, 0xf2, 0x88, 0x70, 0x65, 0x9d, 0x4f, 0xb7, 0xa2, 0x5a, 0x1b, 0xe3, 0xf6, 0xe, 0xdc, 0x24, 0x31, 0xc9, 0x29, 0xd1, 0xc4, 0x3c, 0xee, 0x16, 0x3, 0xfb, 0xba, 0x42, 0x57, 0xaf, 0x7d, 0x85, 0x90, 0x68, 0x12, 0xea, 0xff, 0x7, 0xd5, 0x2d, 0x38, 0xc0, 0x81, 0x79, 0x6c, 0x94, 0x46, 0xbe, 0xab, 0x53, 0x5f, 0xa7, 0xb2, 0x4a, 0x98, 0x60, 0x75, 0x8d, 0xcc, 0x34, 0x21, 0xd9, 0xb, 0xf3, 0xe6, 0x1e, 0x64, 0x9c, 0x89, 0x71, 0xa3, 0x5b, 0x4e, 0xb6, 0xf7, 0xf, 0x1a, 0xe2, 0x30, 0xc8, 0xdd, 0x25},
+ {0x0, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e, 0x2b, 0xd2, 0xc4, 0x3d, 0xe8, 0x11, 0x7, 0xfe, 0xb0, 0x49, 0x5f, 0xa6, 0x73, 0x8a, 0x9c, 0x65, 0x56, 0xaf, 0xb9, 0x40, 0x95, 0x6c, 0x7a, 0x83, 0xcd, 0x34, 0x22, 0xdb, 0xe, 0xf7, 0xe1, 0x18, 0x7d, 0x84, 0x92, 0x6b, 0xbe, 0x47, 0x51, 0xa8, 0xe6, 0x1f, 0x9, 0xf0, 0x25, 0xdc, 0xca, 0x33, 0xac, 0x55, 0x43, 0xba, 0x6f, 0x96, 0x80, 0x79, 0x37, 0xce, 0xd8, 0x21, 0xf4, 0xd, 0x1b, 0xe2, 0x87, 0x7e, 0x68, 0x91, 0x44, 0xbd, 0xab, 0x52, 0x1c, 0xe5, 0xf3, 0xa, 0xdf, 0x26, 0x30, 0xc9, 0xfa, 0x3, 0x15, 0xec, 0x39, 0xc0, 0xd6, 0x2f, 0x61, 0x98, 0x8e, 0x77, 0xa2, 0x5b, 0x4d, 0xb4, 0xd1, 0x28, 0x3e, 0xc7, 0x12, 0xeb, 0xfd, 0x4, 0x4a, 0xb3, 0xa5, 0x5c, 0x89, 0x70, 0x66, 0x9f, 0x45, 0xbc, 0xaa, 0x53, 0x86, 0x7f, 0x69, 0x90, 0xde, 0x27, 0x31, 0xc8, 0x1d, 0xe4, 0xf2, 0xb, 0x6e, 0x97, 0x81, 0x78, 0xad, 0x54, 0x42, 0xbb, 0xf5, 0xc, 0x1a, 0xe3, 0x36, 0xcf, 0xd9, 0x20, 0x13, 0xea, 0xfc, 0x5, 0xd0, 0x29, 0x3f, 0xc6, 0x88, 0x71, 0x67, 0x9e, 0x4b, 0xb2, 0xa4, 0x5d, 0x38, 0xc1, 0xd7, 0x2e, 0xfb, 0x2, 0x14, 0xed, 0xa3, 0x5a, 0x4c, 0xb5, 0x60, 0x99, 0x8f, 0x76, 0xe9, 0x10, 0x6, 0xff, 0x2a, 0xd3, 0xc5, 0x3c, 0x72, 0x8b, 0x9d, 0x64, 0xb1, 0x48, 0x5e, 0xa7, 0xc2, 0x3b, 0x2d, 0xd4, 0x1, 0xf8, 0xee, 0x17, 0x59, 0xa0, 0xb6, 0x4f, 0x9a, 0x63, 0x75, 0x8c, 0xbf, 0x46, 0x50, 0xa9, 0x7c, 0x85, 0x93, 0x6a, 0x24, 0xdd, 0xcb, 0x32, 0xe7, 0x1e, 0x8, 0xf1, 0x94, 0x6d, 0x7b, 0x82, 0x57, 0xae, 0xb8, 0x41, 0xf, 0xf6, 0xe0, 0x19, 0xcc, 0x35, 0x23, 0xda},
+ {0x0, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f, 0x1b, 0xe1, 0xf2, 0x8, 0xd4, 0x2e, 0x3d, 0xc7, 0x98, 0x62, 0x71, 0x8b, 0x57, 0xad, 0xbe, 0x44, 0x36, 0xcc, 0xdf, 0x25, 0xf9, 0x3, 0x10, 0xea, 0xb5, 0x4f, 0x5c, 0xa6, 0x7a, 0x80, 0x93, 0x69, 0x2d, 0xd7, 0xc4, 0x3e, 0xe2, 0x18, 0xb, 0xf1, 0xae, 0x54, 0x47, 0xbd, 0x61, 0x9b, 0x88, 0x72, 0x6c, 0x96, 0x85, 0x7f, 0xa3, 0x59, 0x4a, 0xb0, 0xef, 0x15, 0x6, 0xfc, 0x20, 0xda, 0xc9, 0x33, 0x77, 0x8d, 0x9e, 0x64, 0xb8, 0x42, 0x51, 0xab, 0xf4, 0xe, 0x1d, 0xe7, 0x3b, 0xc1, 0xd2, 0x28, 0x5a, 0xa0, 0xb3, 0x49, 0x95, 0x6f, 0x7c, 0x86, 0xd9, 0x23, 0x30, 0xca, 0x16, 0xec, 0xff, 0x5, 0x41, 0xbb, 0xa8, 0x52, 0x8e, 0x74, 0x67, 0x9d, 0xc2, 0x38, 0x2b, 0xd1, 0xd, 0xf7, 0xe4, 0x1e, 0xd8, 0x22, 0x31, 0xcb, 0x17, 0xed, 0xfe, 0x4, 0x5b, 0xa1, 0xb2, 0x48, 0x94, 0x6e, 0x7d, 0x87, 0xc3, 0x39, 0x2a, 0xd0, 0xc, 0xf6, 0xe5, 0x1f, 0x40, 0xba, 0xa9, 0x53, 0x8f, 0x75, 0x66, 0x9c, 0xee, 0x14, 0x7, 0xfd, 0x21, 0xdb, 0xc8, 0x32, 0x6d, 0x97, 0x84, 0x7e, 0xa2, 0x58, 0x4b, 0xb1, 0xf5, 0xf, 0x1c, 0xe6, 0x3a, 0xc0, 0xd3, 0x29, 0x76, 0x8c, 0x9f, 0x65, 0xb9, 0x43, 0x50, 0xaa, 0xb4, 0x4e, 0x5d, 0xa7, 0x7b, 0x81, 0x92, 0x68, 0x37, 0xcd, 0xde, 0x24, 0xf8, 0x2, 0x11, 0xeb, 0xaf, 0x55, 0x46, 0xbc, 0x60, 0x9a, 0x89, 0x73, 0x2c, 0xd6, 0xc5, 0x3f, 0xe3, 0x19, 0xa, 0xf0, 0x82, 0x78, 0x6b, 0x91, 0x4d, 0xb7, 0xa4, 0x5e, 0x1, 0xfb, 0xe8, 0x12, 0xce, 0x34, 0x27, 0xdd, 0x99, 0x63, 0x70, 0x8a, 0x56, 0xac, 0xbf, 0x45, 0x1a, 0xe0, 0xf3, 0x9, 0xd5, 0x2f, 0x3c, 0xc6},
+ {0x0, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50, 0xb, 0xf0, 0xe0, 0x1b, 0xc0, 0x3b, 0x2b, 0xd0, 0x80, 0x7b, 0x6b, 0x90, 0x4b, 0xb0, 0xa0, 0x5b, 0x16, 0xed, 0xfd, 0x6, 0xdd, 0x26, 0x36, 0xcd, 0x9d, 0x66, 0x76, 0x8d, 0x56, 0xad, 0xbd, 0x46, 0x1d, 0xe6, 0xf6, 0xd, 0xd6, 0x2d, 0x3d, 0xc6, 0x96, 0x6d, 0x7d, 0x86, 0x5d, 0xa6, 0xb6, 0x4d, 0x2c, 0xd7, 0xc7, 0x3c, 0xe7, 0x1c, 0xc, 0xf7, 0xa7, 0x5c, 0x4c, 0xb7, 0x6c, 0x97, 0x87, 0x7c, 0x27, 0xdc, 0xcc, 0x37, 0xec, 0x17, 0x7, 0xfc, 0xac, 0x57, 0x47, 0xbc, 0x67, 0x9c, 0x8c, 0x77, 0x3a, 0xc1, 0xd1, 0x2a, 0xf1, 0xa, 0x1a, 0xe1, 0xb1, 0x4a, 0x5a, 0xa1, 0x7a, 0x81, 0x91, 0x6a, 0x31, 0xca, 0xda, 0x21, 0xfa, 0x1, 0x11, 0xea, 0xba, 0x41, 0x51, 0xaa, 0x71, 0x8a, 0x9a, 0x61, 0x58, 0xa3, 0xb3, 0x48, 0x93, 0x68, 0x78, 0x83, 0xd3, 0x28, 0x38, 0xc3, 0x18, 0xe3, 0xf3, 0x8, 0x53, 0xa8, 0xb8, 0x43, 0x98, 0x63, 0x73, 0x88, 0xd8, 0x23, 0x33, 0xc8, 0x13, 0xe8, 0xf8, 0x3, 0x4e, 0xb5, 0xa5, 0x5e, 0x85, 0x7e, 0x6e, 0x95, 0xc5, 0x3e, 0x2e, 0xd5, 0xe, 0xf5, 0xe5, 0x1e, 0x45, 0xbe, 0xae, 0x55, 0x8e, 0x75, 0x65, 0x9e, 0xce, 0x35, 0x25, 0xde, 0x5, 0xfe, 0xee, 0x15, 0x74, 0x8f, 0x9f, 0x64, 0xbf, 0x44, 0x54, 0xaf, 0xff, 0x4, 0x14, 0xef, 0x34, 0xcf, 0xdf, 0x24, 0x7f, 0x84, 0x94, 0x6f, 0xb4, 0x4f, 0x5f, 0xa4, 0xf4, 0xf, 0x1f, 0xe4, 0x3f, 0xc4, 0xd4, 0x2f, 0x62, 0x99, 0x89, 0x72, 0xa9, 0x52, 0x42, 0xb9, 0xe9, 0x12, 0x2, 0xf9, 0x22, 0xd9, 0xc9, 0x32, 0x69, 0x92, 0x82, 0x79, 0xa2, 0x59, 0x49, 0xb2, 0xe2, 0x19, 0x9, 0xf2, 0x29, 0xd2, 0xc2, 0x39},
+ {0x0, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d, 0x7b, 0x87, 0x9e, 0x62, 0xac, 0x50, 0x49, 0xb5, 0xc8, 0x34, 0x2d, 0xd1, 0x1f, 0xe3, 0xfa, 0x6, 0xf6, 0xa, 0x13, 0xef, 0x21, 0xdd, 0xc4, 0x38, 0x45, 0xb9, 0xa0, 0x5c, 0x92, 0x6e, 0x77, 0x8b, 0x8d, 0x71, 0x68, 0x94, 0x5a, 0xa6, 0xbf, 0x43, 0x3e, 0xc2, 0xdb, 0x27, 0xe9, 0x15, 0xc, 0xf0, 0xf1, 0xd, 0x14, 0xe8, 0x26, 0xda, 0xc3, 0x3f, 0x42, 0xbe, 0xa7, 0x5b, 0x95, 0x69, 0x70, 0x8c, 0x8a, 0x76, 0x6f, 0x93, 0x5d, 0xa1, 0xb8, 0x44, 0x39, 0xc5, 0xdc, 0x20, 0xee, 0x12, 0xb, 0xf7, 0x7, 0xfb, 0xe2, 0x1e, 0xd0, 0x2c, 0x35, 0xc9, 0xb4, 0x48, 0x51, 0xad, 0x63, 0x9f, 0x86, 0x7a, 0x7c, 0x80, 0x99, 0x65, 0xab, 0x57, 0x4e, 0xb2, 0xcf, 0x33, 0x2a, 0xd6, 0x18, 0xe4, 0xfd, 0x1, 0xff, 0x3, 0x1a, 0xe6, 0x28, 0xd4, 0xcd, 0x31, 0x4c, 0xb0, 0xa9, 0x55, 0x9b, 0x67, 0x7e, 0x82, 0x84, 0x78, 0x61, 0x9d, 0x53, 0xaf, 0xb6, 0x4a, 0x37, 0xcb, 0xd2, 0x2e, 0xe0, 0x1c, 0x5, 0xf9, 0x9, 0xf5, 0xec, 0x10, 0xde, 0x22, 0x3b, 0xc7, 0xba, 0x46, 0x5f, 0xa3, 0x6d, 0x91, 0x88, 0x74, 0x72, 0x8e, 0x97, 0x6b, 0xa5, 0x59, 0x40, 0xbc, 0xc1, 0x3d, 0x24, 0xd8, 0x16, 0xea, 0xf3, 0xf, 0xe, 0xf2, 0xeb, 0x17, 0xd9, 0x25, 0x3c, 0xc0, 0xbd, 0x41, 0x58, 0xa4, 0x6a, 0x96, 0x8f, 0x73, 0x75, 0x89, 0x90, 0x6c, 0xa2, 0x5e, 0x47, 0xbb, 0xc6, 0x3a, 0x23, 0xdf, 0x11, 0xed, 0xf4, 0x8, 0xf8, 0x4, 0x1d, 0xe1, 0x2f, 0xd3, 0xca, 0x36, 0x4b, 0xb7, 0xae, 0x52, 0x9c, 0x60, 0x79, 0x85, 0x83, 0x7f, 0x66, 0x9a, 0x54, 0xa8, 0xb1, 0x4d, 0x30, 0xcc, 0xd5, 0x29, 0xe7, 0x1b, 0x2, 0xfe},
+ {0x0, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72, 0x6b, 0x96, 0x8c, 0x71, 0xb8, 0x45, 0x5f, 0xa2, 0xd0, 0x2d, 0x37, 0xca, 0x3, 0xfe, 0xe4, 0x19, 0xd6, 0x2b, 0x31, 0xcc, 0x5, 0xf8, 0xe2, 0x1f, 0x6d, 0x90, 0x8a, 0x77, 0xbe, 0x43, 0x59, 0xa4, 0xbd, 0x40, 0x5a, 0xa7, 0x6e, 0x93, 0x89, 0x74, 0x6, 0xfb, 0xe1, 0x1c, 0xd5, 0x28, 0x32, 0xcf, 0xb1, 0x4c, 0x56, 0xab, 0x62, 0x9f, 0x85, 0x78, 0xa, 0xf7, 0xed, 0x10, 0xd9, 0x24, 0x3e, 0xc3, 0xda, 0x27, 0x3d, 0xc0, 0x9, 0xf4, 0xee, 0x13, 0x61, 0x9c, 0x86, 0x7b, 0xb2, 0x4f, 0x55, 0xa8, 0x67, 0x9a, 0x80, 0x7d, 0xb4, 0x49, 0x53, 0xae, 0xdc, 0x21, 0x3b, 0xc6, 0xf, 0xf2, 0xe8, 0x15, 0xc, 0xf1, 0xeb, 0x16, 0xdf, 0x22, 0x38, 0xc5, 0xb7, 0x4a, 0x50, 0xad, 0x64, 0x99, 0x83, 0x7e, 0x7f, 0x82, 0x98, 0x65, 0xac, 0x51, 0x4b, 0xb6, 0xc4, 0x39, 0x23, 0xde, 0x17, 0xea, 0xf0, 0xd, 0x14, 0xe9, 0xf3, 0xe, 0xc7, 0x3a, 0x20, 0xdd, 0xaf, 0x52, 0x48, 0xb5, 0x7c, 0x81, 0x9b, 0x66, 0xa9, 0x54, 0x4e, 0xb3, 0x7a, 0x87, 0x9d, 0x60, 0x12, 0xef, 0xf5, 0x8, 0xc1, 0x3c, 0x26, 0xdb, 0xc2, 0x3f, 0x25, 0xd8, 0x11, 0xec, 0xf6, 0xb, 0x79, 0x84, 0x9e, 0x63, 0xaa, 0x57, 0x4d, 0xb0, 0xce, 0x33, 0x29, 0xd4, 0x1d, 0xe0, 0xfa, 0x7, 0x75, 0x88, 0x92, 0x6f, 0xa6, 0x5b, 0x41, 0xbc, 0xa5, 0x58, 0x42, 0xbf, 0x76, 0x8b, 0x91, 0x6c, 0x1e, 0xe3, 0xf9, 0x4, 0xcd, 0x30, 0x2a, 0xd7, 0x18, 0xe5, 0xff, 0x2, 0xcb, 0x36, 0x2c, 0xd1, 0xa3, 0x5e, 0x44, 0xb9, 0x70, 0x8d, 0x97, 0x6a, 0x73, 0x8e, 0x94, 0x69, 0xa0, 0x5d, 0x47, 0xba, 0xc8, 0x35, 0x2f, 0xd2, 0x1b, 0xe6, 0xfc, 0x1},
+ {0x0, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63, 0x5b, 0xa5, 0xba, 0x44, 0x84, 0x7a, 0x65, 0x9b, 0xf8, 0x6, 0x19, 0xe7, 0x27, 0xd9, 0xc6, 0x38, 0xb6, 0x48, 0x57, 0xa9, 0x69, 0x97, 0x88, 0x76, 0x15, 0xeb, 0xf4, 0xa, 0xca, 0x34, 0x2b, 0xd5, 0xed, 0x13, 0xc, 0xf2, 0x32, 0xcc, 0xd3, 0x2d, 0x4e, 0xb0, 0xaf, 0x51, 0x91, 0x6f, 0x70, 0x8e, 0x71, 0x8f, 0x90, 0x6e, 0xae, 0x50, 0x4f, 0xb1, 0xd2, 0x2c, 0x33, 0xcd, 0xd, 0xf3, 0xec, 0x12, 0x2a, 0xd4, 0xcb, 0x35, 0xf5, 0xb, 0x14, 0xea, 0x89, 0x77, 0x68, 0x96, 0x56, 0xa8, 0xb7, 0x49, 0xc7, 0x39, 0x26, 0xd8, 0x18, 0xe6, 0xf9, 0x7, 0x64, 0x9a, 0x85, 0x7b, 0xbb, 0x45, 0x5a, 0xa4, 0x9c, 0x62, 0x7d, 0x83, 0x43, 0xbd, 0xa2, 0x5c, 0x3f, 0xc1, 0xde, 0x20, 0xe0, 0x1e, 0x1, 0xff, 0xe2, 0x1c, 0x3, 0xfd, 0x3d, 0xc3, 0xdc, 0x22, 0x41, 0xbf, 0xa0, 0x5e, 0x9e, 0x60, 0x7f, 0x81, 0xb9, 0x47, 0x58, 0xa6, 0x66, 0x98, 0x87, 0x79, 0x1a, 0xe4, 0xfb, 0x5, 0xc5, 0x3b, 0x24, 0xda, 0x54, 0xaa, 0xb5, 0x4b, 0x8b, 0x75, 0x6a, 0x94, 0xf7, 0x9, 0x16, 0xe8, 0x28, 0xd6, 0xc9, 0x37, 0xf, 0xf1, 0xee, 0x10, 0xd0, 0x2e, 0x31, 0xcf, 0xac, 0x52, 0x4d, 0xb3, 0x73, 0x8d, 0x92, 0x6c, 0x93, 0x6d, 0x72, 0x8c, 0x4c, 0xb2, 0xad, 0x53, 0x30, 0xce, 0xd1, 0x2f, 0xef, 0x11, 0xe, 0xf0, 0xc8, 0x36, 0x29, 0xd7, 0x17, 0xe9, 0xf6, 0x8, 0x6b, 0x95, 0x8a, 0x74, 0xb4, 0x4a, 0x55, 0xab, 0x25, 0xdb, 0xc4, 0x3a, 0xfa, 0x4, 0x1b, 0xe5, 0x86, 0x78, 0x67, 0x99, 0x59, 0xa7, 0xb8, 0x46, 0x7e, 0x80, 0x9f, 0x61, 0xa1, 0x5f, 0x40, 0xbe, 0xdd, 0x23, 0x3c, 0xc2, 0x2, 0xfc, 0xe3, 0x1d},
+ {0x0, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c, 0x4b, 0xb4, 0xa8, 0x57, 0x90, 0x6f, 0x73, 0x8c, 0xe0, 0x1f, 0x3, 0xfc, 0x3b, 0xc4, 0xd8, 0x27, 0x96, 0x69, 0x75, 0x8a, 0x4d, 0xb2, 0xae, 0x51, 0x3d, 0xc2, 0xde, 0x21, 0xe6, 0x19, 0x5, 0xfa, 0xdd, 0x22, 0x3e, 0xc1, 0x6, 0xf9, 0xe5, 0x1a, 0x76, 0x89, 0x95, 0x6a, 0xad, 0x52, 0x4e, 0xb1, 0x31, 0xce, 0xd2, 0x2d, 0xea, 0x15, 0x9, 0xf6, 0x9a, 0x65, 0x79, 0x86, 0x41, 0xbe, 0xa2, 0x5d, 0x7a, 0x85, 0x99, 0x66, 0xa1, 0x5e, 0x42, 0xbd, 0xd1, 0x2e, 0x32, 0xcd, 0xa, 0xf5, 0xe9, 0x16, 0xa7, 0x58, 0x44, 0xbb, 0x7c, 0x83, 0x9f, 0x60, 0xc, 0xf3, 0xef, 0x10, 0xd7, 0x28, 0x34, 0xcb, 0xec, 0x13, 0xf, 0xf0, 0x37, 0xc8, 0xd4, 0x2b, 0x47, 0xb8, 0xa4, 0x5b, 0x9c, 0x63, 0x7f, 0x80, 0x62, 0x9d, 0x81, 0x7e, 0xb9, 0x46, 0x5a, 0xa5, 0xc9, 0x36, 0x2a, 0xd5, 0x12, 0xed, 0xf1, 0xe, 0x29, 0xd6, 0xca, 0x35, 0xf2, 0xd, 0x11, 0xee, 0x82, 0x7d, 0x61, 0x9e, 0x59, 0xa6, 0xba, 0x45, 0xf4, 0xb, 0x17, 0xe8, 0x2f, 0xd0, 0xcc, 0x33, 0x5f, 0xa0, 0xbc, 0x43, 0x84, 0x7b, 0x67, 0x98, 0xbf, 0x40, 0x5c, 0xa3, 0x64, 0x9b, 0x87, 0x78, 0x14, 0xeb, 0xf7, 0x8, 0xcf, 0x30, 0x2c, 0xd3, 0x53, 0xac, 0xb0, 0x4f, 0x88, 0x77, 0x6b, 0x94, 0xf8, 0x7, 0x1b, 0xe4, 0x23, 0xdc, 0xc0, 0x3f, 0x18, 0xe7, 0xfb, 0x4, 0xc3, 0x3c, 0x20, 0xdf, 0xb3, 0x4c, 0x50, 0xaf, 0x68, 0x97, 0x8b, 0x74, 0xc5, 0x3a, 0x26, 0xd9, 0x1e, 0xe1, 0xfd, 0x2, 0x6e, 0x91, 0x8d, 0x72, 0xb5, 0x4a, 0x56, 0xa9, 0x8e, 0x71, 0x6d, 0x92, 0x55, 0xaa, 0xb6, 0x49, 0x25, 0xda, 0xc6, 0x39, 0xfe, 0x1, 0x1d, 0xe2}}
+
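+// mulTableLow[c][i] is the GF(2^8) product of c and the low-nibble value i
+// (0..15). XORed with the matching mulTableHigh entry it reconstructs a full
+// byte multiply, which the SIMD shuffle kernels build on.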
+var mulTableLow = [256][16]uint8{{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf},
+ {0x0, 0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e},
+ {0x0, 0x3, 0x6, 0x5, 0xc, 0xf, 0xa, 0x9, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11},
+ {0x0, 0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c},
+ {0x0, 0x5, 0xa, 0xf, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33},
+ {0x0, 0x6, 0xc, 0xa, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22},
+ {0x0, 0x7, 0xe, 0x9, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d},
+ {0x0, 0x8, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78},
+ {0x0, 0x9, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77},
+ {0x0, 0xa, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66},
+ {0x0, 0xb, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69},
+ {0x0, 0xc, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44},
+ {0x0, 0xd, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b},
+ {0x0, 0xe, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a},
+ {0x0, 0xf, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55},
+ {0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0},
+ {0x0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff},
+ {0x0, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee},
+ {0x0, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1},
+ {0x0, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc},
+ {0x0, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3},
+ {0x0, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2},
+ {0x0, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd},
+ {0x0, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88},
+ {0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87},
+ {0x0, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96},
+ {0x0, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99},
+ {0x0, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4},
+ {0x0, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb},
+ {0x0, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa},
+ {0x0, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5},
+ {0x0, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd},
+ {0x0, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2},
+ {0x0, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0xd, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3},
+ {0x0, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x5, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec},
+ {0x0, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1},
+ {0x0, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce},
+ {0x0, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0xb, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf},
+ {0x0, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x2, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0},
+ {0x0, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0xd, 0x25, 0xfd, 0xd5, 0xad, 0x85},
+ {0x0, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x7, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a},
+ {0x0, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b},
+ {0x0, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94},
+ {0x0, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x9, 0xcd, 0xe1, 0x95, 0xb9},
+ {0x0, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x2, 0xc1, 0xec, 0x9b, 0xb6},
+ {0x0, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7},
+ {0x0, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8},
+ {0x0, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0xd},
+ {0x0, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x2},
+ {0x0, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13},
+ {0x0, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c},
+ {0x0, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x5, 0x31},
+ {0x0, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0xb, 0x3e},
+ {0x0, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f},
+ {0x0, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20},
+ {0x0, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x5, 0x4d, 0x75},
+ {0x0, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x8, 0x43, 0x7a},
+ {0x0, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b},
+ {0x0, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64},
+ {0x0, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0xd, 0x31, 0x75, 0x49},
+ {0x0, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x1, 0x3c, 0x7b, 0x46},
+ {0x0, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57},
+ {0x0, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58},
+ {0x0, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7},
+ {0x0, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8},
+ {0x0, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9},
+ {0x0, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6},
+ {0x0, 0x44, 0x88, 0xcc, 0xd, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb},
+ {0x0, 0x45, 0x8a, 0xcf, 0x9, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4},
+ {0x0, 0x46, 0x8c, 0xca, 0x5, 0x43, 0x89, 0xcf, 0xa, 0x4c, 0x86, 0xc0, 0xf, 0x49, 0x83, 0xc5},
+ {0x0, 0x47, 0x8e, 0xc9, 0x1, 0x46, 0x8f, 0xc8, 0x2, 0x45, 0x8c, 0xcb, 0x3, 0x44, 0x8d, 0xca},
+ {0x0, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0xf, 0xd7, 0x9f},
+ {0x0, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x2, 0xd9, 0x90},
+ {0x0, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81},
+ {0x0, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e},
+ {0x0, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3},
+ {0x0, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac},
+ {0x0, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x4, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd},
+ {0x0, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0xd, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2},
+ {0x0, 0x50, 0xa0, 0xf0, 0x5d, 0xd, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17},
+ {0x0, 0x51, 0xa2, 0xf3, 0x59, 0x8, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18},
+ {0x0, 0x52, 0xa4, 0xf6, 0x55, 0x7, 0xf1, 0xa3, 0xaa, 0xf8, 0xe, 0x5c, 0xff, 0xad, 0x5b, 0x9},
+ {0x0, 0x53, 0xa6, 0xf5, 0x51, 0x2, 0xf7, 0xa4, 0xa2, 0xf1, 0x4, 0x57, 0xf3, 0xa0, 0x55, 0x6},
+ {0x0, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b},
+ {0x0, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24},
+ {0x0, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35},
+ {0x0, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a},
+ {0x0, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f},
+ {0x0, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60},
+ {0x0, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x4, 0x9f, 0xc5, 0x2b, 0x71},
+ {0x0, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0xf, 0x93, 0xc8, 0x25, 0x7e},
+ {0x0, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0xf, 0x53},
+ {0x0, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x1, 0x5c},
+ {0x0, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d},
+ {0x0, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42},
+ {0x0, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a},
+ {0x0, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15},
+ {0x0, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x4},
+ {0x0, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0xb},
+ {0x0, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x7, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26},
+ {0x0, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0xf, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29},
+ {0x0, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38},
+ {0x0, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37},
+ {0x0, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x5, 0x67, 0xf, 0xb7, 0xdf, 0xda, 0xb2, 0xa, 0x62},
+ {0x0, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x2, 0x6f, 0x6, 0xbd, 0xd4, 0xd6, 0xbf, 0x4, 0x6d},
+ {0x0, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0xb, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c},
+ {0x0, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0xc, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73},
+ {0x0, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e},
+ {0x0, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51},
+ {0x0, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40},
+ {0x0, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f},
+ {0x0, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0xa, 0x9a, 0xea},
+ {0x0, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x7, 0x94, 0xe5},
+ {0x0, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4},
+ {0x0, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb},
+ {0x0, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6},
+ {0x0, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9},
+ {0x0, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0xd, 0x52, 0x24, 0xbe, 0xc8},
+ {0x0, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x6, 0x5e, 0x29, 0xb0, 0xc7},
+ {0x0, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0xd, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92},
+ {0x0, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0xb, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d},
+ {0x0, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x1, 0x7b, 0xf7, 0x8d, 0x3, 0x79, 0x2, 0x78, 0xf6, 0x8c},
+ {0x0, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x7, 0x7c, 0xff, 0x84, 0x9, 0x72, 0xe, 0x75, 0xf8, 0x83},
+ {0x0, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae},
+ {0x0, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1},
+ {0x0, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0},
+ {0x0, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf},
+ {0x0, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3},
+ {0x0, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc},
+ {0x0, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd},
+ {0x0, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2},
+ {0x0, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef},
+ {0x0, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0},
+ {0x0, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1},
+ {0x0, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe},
+ {0x0, 0x88, 0xd, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab},
+ {0x0, 0x89, 0xf, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4},
+ {0x0, 0x8a, 0x9, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5},
+ {0x0, 0x8b, 0xb, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba},
+ {0x0, 0x8c, 0x5, 0x89, 0xa, 0x86, 0xf, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97},
+ {0x0, 0x8d, 0x7, 0x8a, 0xe, 0x83, 0x9, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98},
+ {0x0, 0x8e, 0x1, 0x8f, 0x2, 0x8c, 0x3, 0x8d, 0x4, 0x8a, 0x5, 0x8b, 0x6, 0x88, 0x7, 0x89},
+ {0x0, 0x8f, 0x3, 0x8c, 0x6, 0x89, 0x5, 0x8a, 0xc, 0x83, 0xf, 0x80, 0xa, 0x85, 0x9, 0x86},
+ {0x0, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23},
+ {0x0, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c},
+ {0x0, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x4, 0xaf, 0x3d},
+ {0x0, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x9, 0xa1, 0x32},
+ {0x0, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f},
+ {0x0, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10},
+ {0x0, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x1},
+ {0x0, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0xe},
+ {0x0, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x1, 0xee, 0x76, 0xc3, 0x5b},
+ {0x0, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0xa, 0xe2, 0x7b, 0xcd, 0x54},
+ {0x0, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45},
+ {0x0, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a},
+ {0x0, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x8, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67},
+ {0x0, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x1, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68},
+ {0x0, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79},
+ {0x0, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76},
+ {0x0, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e},
+ {0x0, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21},
+ {0x0, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30},
+ {0x0, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f},
+ {0x0, 0xa4, 0x55, 0xf1, 0xaa, 0xe, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12},
+ {0x0, 0xa5, 0x57, 0xf2, 0xae, 0xb, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d},
+ {0x0, 0xa6, 0x51, 0xf7, 0xa2, 0x4, 0xf3, 0x55, 0x59, 0xff, 0x8, 0xae, 0xfb, 0x5d, 0xaa, 0xc},
+ {0x0, 0xa7, 0x53, 0xf4, 0xa6, 0x1, 0xf5, 0x52, 0x51, 0xf6, 0x2, 0xa5, 0xf7, 0x50, 0xa4, 0x3},
+ {0x0, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56},
+ {0x0, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59},
+ {0x0, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x1, 0xe2, 0x48},
+ {0x0, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0xc, 0xec, 0x47},
+ {0x0, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x9, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a},
+ {0x0, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x1, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65},
+ {0x0, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74},
+ {0x0, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b},
+ {0x0, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde},
+ {0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1},
+ {0x0, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0xb, 0xb9, 0x72, 0xc0},
+ {0x0, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x7, 0xb4, 0x7c, 0xcf},
+ {0x0, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x8, 0x23, 0x97, 0x56, 0xe2},
+ {0x0, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x3, 0x2f, 0x9a, 0x58, 0xed},
+ {0x0, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc},
+ {0x0, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3},
+ {0x0, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0xf, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6},
+ {0x0, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x8, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9},
+ {0x0, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x1, 0xb9, 0x3, 0xd0, 0x6a, 0x6b, 0xd1, 0x2, 0xb8},
+ {0x0, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x6, 0xb1, 0xa, 0xda, 0x61, 0x67, 0xdc, 0xc, 0xb7},
+ {0x0, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a},
+ {0x0, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95},
+ {0x0, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84},
+ {0x0, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b},
+ {0x0, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34},
+ {0x0, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b},
+ {0x0, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x5, 0x71, 0xb3, 0xe8, 0x2a},
+ {0x0, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0xe, 0x7d, 0xbe, 0xe6, 0x25},
+ {0x0, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x8},
+ {0x0, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x7},
+ {0x0, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16},
+ {0x0, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19},
+ {0x0, 0xc8, 0x8d, 0x45, 0x7, 0xcf, 0x8a, 0x42, 0xe, 0xc6, 0x83, 0x4b, 0x9, 0xc1, 0x84, 0x4c},
+ {0x0, 0xc9, 0x8f, 0x46, 0x3, 0xca, 0x8c, 0x45, 0x6, 0xcf, 0x89, 0x40, 0x5, 0xcc, 0x8a, 0x43},
+ {0x0, 0xca, 0x89, 0x43, 0xf, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52},
+ {0x0, 0xcb, 0x8b, 0x40, 0xb, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d},
+ {0x0, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70},
+ {0x0, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f},
+ {0x0, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e},
+ {0x0, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61},
+ {0x0, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0xa, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4},
+ {0x0, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0xd, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb},
+ {0x0, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x4, 0xde, 0xc, 0x67, 0xb5, 0xb1, 0x63, 0x8, 0xda},
+ {0x0, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x3, 0xd6, 0x5, 0x6d, 0xbe, 0xbd, 0x6e, 0x6, 0xd5},
+ {0x0, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8},
+ {0x0, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7},
+ {0x0, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6},
+ {0x0, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9},
+ {0x0, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc},
+ {0x0, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3},
+ {0x0, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0xb, 0x78, 0xa2},
+ {0x0, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x6, 0x76, 0xad},
+ {0x0, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0xb, 0xd7, 0xf9, 0x25, 0x5c, 0x80},
+ {0x0, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x1, 0xdc, 0xf5, 0x28, 0x52, 0x8f},
+ {0x0, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e},
+ {0x0, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91},
+ {0x0, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9},
+ {0x0, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6},
+ {0x0, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0xe, 0x35, 0xd7},
+ {0x0, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x3, 0x3b, 0xd8},
+ {0x0, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5},
+ {0x0, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa},
+ {0x0, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0xd, 0xeb},
+ {0x0, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x3, 0xe4},
+ {0x0, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1},
+ {0x0, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe},
+ {0x0, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x3, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf},
+ {0x0, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0xb, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0},
+ {0x0, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d},
+ {0x0, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82},
+ {0x0, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0xc, 0xbc, 0x52, 0x7d, 0x93},
+ {0x0, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x7, 0xb0, 0x5f, 0x73, 0x9c},
+ {0x0, 0xf0, 0xfd, 0xd, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39},
+ {0x0, 0xf1, 0xff, 0xe, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36},
+ {0x0, 0xf2, 0xf9, 0xb, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27},
+ {0x0, 0xf3, 0xfb, 0x8, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28},
+ {0x0, 0xf4, 0xf5, 0x1, 0xf7, 0x3, 0x2, 0xf6, 0xf3, 0x7, 0x6, 0xf2, 0x4, 0xf0, 0xf1, 0x5},
+ {0x0, 0xf5, 0xf7, 0x2, 0xf3, 0x6, 0x4, 0xf1, 0xfb, 0xe, 0xc, 0xf9, 0x8, 0xfd, 0xff, 0xa},
+ {0x0, 0xf6, 0xf1, 0x7, 0xff, 0x9, 0xe, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b},
+ {0x0, 0xf7, 0xf3, 0x4, 0xfb, 0xc, 0x8, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14},
+ {0x0, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41},
+ {0x0, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e},
+ {0x0, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f},
+ {0x0, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50},
+ {0x0, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d},
+ {0x0, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72},
+ {0x0, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63},
+ {0x0, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c}}
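+
+// mulTableHigh[c][i] is the GF(2^8) product of c and the high-nibble value
+// i<<4 (i in 0..15), the counterpart of mulTableLow for the upper four bits.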
+var mulTableHigh = [256][16]uint8{{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+ {0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0},
+ {0x0, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd},
+ {0x0, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0xd},
+ {0x0, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7},
+ {0x0, 0x50, 0xa0, 0xf0, 0x5d, 0xd, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17},
+ {0x0, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a},
+ {0x0, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0xa, 0x9a, 0xea},
+ {0x0, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3},
+ {0x0, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23},
+ {0x0, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e},
+ {0x0, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde},
+ {0x0, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34},
+ {0x0, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0xa, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4},
+ {0x0, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9},
+ {0x0, 0xf0, 0xfd, 0xd, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39},
+ {0x0, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb},
+ {0x0, 0xd, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b},
+ {0x0, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x1, 0x3c, 0x7b, 0x46},
+ {0x0, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x2, 0xc1, 0xec, 0x9b, 0xb6},
+ {0x0, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x1, 0x5c},
+ {0x0, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac},
+ {0x0, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1},
+ {0x0, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51},
+ {0x0, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x1, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68},
+ {0x0, 0x8d, 0x7, 0x8a, 0xe, 0x83, 0x9, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98},
+ {0x0, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95},
+ {0x0, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x1, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65},
+ {0x0, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x1, 0xdc, 0xf5, 0x28, 0x52, 0x8f},
+ {0x0, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f},
+ {0x0, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72},
+ {0x0, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82},
+ {0x0, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b},
+ {0x0, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b},
+ {0x0, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96},
+ {0x0, 0xa, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66},
+ {0x0, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x1, 0x7b, 0xf7, 0x8d, 0x3, 0x79, 0x2, 0x78, 0xf6, 0x8c},
+ {0x0, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0xb, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c},
+ {0x0, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x4, 0x9f, 0xc5, 0x2b, 0x71},
+ {0x0, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81},
+ {0x0, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x1, 0xb9, 0x3, 0xd0, 0x6a, 0x6b, 0xd1, 0x2, 0xb8},
+ {0x0, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x1, 0xe2, 0x48},
+ {0x0, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45},
+ {0x0, 0x8a, 0x9, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5},
+ {0x0, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f},
+ {0x0, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x3, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf},
+ {0x0, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0xb, 0x78, 0xa2},
+ {0x0, 0xca, 0x89, 0x43, 0xf, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52},
+ {0x0, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x2, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0},
+ {0x0, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20},
+ {0x0, 0x7, 0xe, 0x9, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d},
+ {0x0, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd},
+ {0x0, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37},
+ {0x0, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x6, 0x5e, 0x29, 0xb0, 0xc7},
+ {0x0, 0x47, 0x8e, 0xc9, 0x1, 0x46, 0x8f, 0xc8, 0x2, 0x45, 0x8c, 0xcb, 0x3, 0x44, 0x8d, 0xca},
+ {0x0, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a},
+ {0x0, 0xa7, 0x53, 0xf4, 0xa6, 0x1, 0xf5, 0x52, 0x51, 0xf6, 0x2, 0xa5, 0xf7, 0x50, 0xa4, 0x3},
+ {0x0, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3},
+ {0x0, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe},
+ {0x0, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0xe},
+ {0x0, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x3, 0xe4},
+ {0x0, 0xf7, 0xf3, 0x4, 0xfb, 0xc, 0x8, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14},
+ {0x0, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19},
+ {0x0, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9},
+ {0x0, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6},
+ {0x0, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x7, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26},
+ {0x0, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b},
+ {0x0, 0x44, 0x88, 0xcc, 0xd, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb},
+ {0x0, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x5, 0x31},
+ {0x0, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1},
+ {0x0, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc},
+ {0x0, 0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c},
+ {0x0, 0xf4, 0xf5, 0x1, 0xf7, 0x3, 0x2, 0xf6, 0xf3, 0x7, 0x6, 0xf2, 0x4, 0xf0, 0xf1, 0x5},
+ {0x0, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5},
+ {0x0, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8},
+ {0x0, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x8},
+ {0x0, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x8, 0x23, 0x97, 0x56, 0xe2},
+ {0x0, 0xa4, 0x55, 0xf1, 0xaa, 0xe, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12},
+ {0x0, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f},
+ {0x0, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef},
+ {0x0, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x2, 0x6f, 0x6, 0xbd, 0xd4, 0xd6, 0xbf, 0x4, 0x6d},
+ {0x0, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0xb, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d},
+ {0x0, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x2, 0xd9, 0x90},
+ {0x0, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60},
+ {0x0, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x7, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a},
+ {0x0, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x8, 0x43, 0x7a},
+ {0x0, 0x9, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77},
+ {0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87},
+ {0x0, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe},
+ {0x0, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e},
+ {0x0, 0xc9, 0x8f, 0x46, 0x3, 0xca, 0x8c, 0x45, 0x6, 0xcf, 0x89, 0x40, 0x5, 0xcc, 0x8a, 0x43},
+ {0x0, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3},
+ {0x0, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59},
+ {0x0, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x8, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9},
+ {0x0, 0x89, 0xf, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4},
+ {0x0, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0xa, 0xe2, 0x7b, 0xcd, 0x54},
+ {0x0, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x4, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd},
+ {0x0, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d},
+ {0x0, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40},
+ {0x0, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0},
+ {0x0, 0xe, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a},
+ {0x0, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa},
+ {0x0, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7},
+ {0x0, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57},
+ {0x0, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e},
+ {0x0, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e},
+ {0x0, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0xc, 0xbc, 0x52, 0x7d, 0x93},
+ {0x0, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63},
+ {0x0, 0x8e, 0x1, 0x8f, 0x2, 0x8c, 0x3, 0x8d, 0x4, 0x8a, 0x5, 0x8b, 0x6, 0x88, 0x7, 0x89},
+ {0x0, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79},
+ {0x0, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74},
+ {0x0, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84},
+ {0x0, 0x53, 0xa6, 0xf5, 0x51, 0x2, 0xf7, 0xa4, 0xa2, 0xf1, 0x4, 0x57, 0xf3, 0xa0, 0x55, 0x6},
+ {0x0, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6},
+ {0x0, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb},
+ {0x0, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0xb},
+ {0x0, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1},
+ {0x0, 0x3, 0x6, 0x5, 0xc, 0xf, 0xa, 0x9, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11},
+ {0x0, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c},
+ {0x0, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x5, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec},
+ {0x0, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x3, 0xd6, 0x5, 0x6d, 0xbe, 0xbd, 0x6e, 0x6, 0xd5},
+ {0x0, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0xe, 0x7d, 0xbe, 0xe6, 0x25},
+ {0x0, 0xf3, 0xfb, 0x8, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28},
+ {0x0, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x3, 0x3b, 0xd8},
+ {0x0, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x9, 0xa1, 0x32},
+ {0x0, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2},
+ {0x0, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x7, 0xb4, 0x7c, 0xcf},
+ {0x0, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f},
+ {0x0, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1},
+ {0x0, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41},
+ {0x0, 0xc8, 0x8d, 0x45, 0x7, 0xcf, 0x8a, 0x42, 0xe, 0xc6, 0x83, 0x4b, 0x9, 0xc1, 0x84, 0x4c},
+ {0x0, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc},
+ {0x0, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56},
+ {0x0, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0xf, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6},
+ {0x0, 0x88, 0xd, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab},
+ {0x0, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x1, 0xee, 0x76, 0xc3, 0x5b},
+ {0x0, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x5, 0x67, 0xf, 0xb7, 0xdf, 0xda, 0xb2, 0xa, 0x62},
+ {0x0, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0xd, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92},
+ {0x0, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0xf, 0xd7, 0x9f},
+ {0x0, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f},
+ {0x0, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0xd, 0x25, 0xfd, 0xd5, 0xad, 0x85},
+ {0x0, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x5, 0x4d, 0x75},
+ {0x0, 0x8, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78},
+ {0x0, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88},
+ {0x0, 0xf5, 0xf7, 0x2, 0xf3, 0x6, 0x4, 0xf1, 0xfb, 0xe, 0xc, 0xf9, 0x8, 0xfd, 0xff, 0xa},
+ {0x0, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa},
+ {0x0, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7},
+ {0x0, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x7},
+ {0x0, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x3, 0x2f, 0x9a, 0x58, 0xed},
+ {0x0, 0xa5, 0x57, 0xf2, 0xae, 0xb, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d},
+ {0x0, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10},
+ {0x0, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0},
+ {0x0, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9},
+ {0x0, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0xf, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29},
+ {0x0, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24},
+ {0x0, 0x45, 0x8a, 0xcf, 0x9, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4},
+ {0x0, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0xb, 0x3e},
+ {0x0, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce},
+ {0x0, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3},
+ {0x0, 0x5, 0xa, 0xf, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33},
+ {0x0, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x4, 0xde, 0xc, 0x67, 0xb5, 0xb1, 0x63, 0x8, 0xda},
+ {0x0, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x5, 0x71, 0xb3, 0xe8, 0x2a},
+ {0x0, 0xf2, 0xf9, 0xb, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27},
+ {0x0, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0xe, 0x35, 0xd7},
+ {0x0, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x4, 0xaf, 0x3d},
+ {0x0, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd},
+ {0x0, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0xb, 0xb9, 0x72, 0xc0},
+ {0x0, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30},
+ {0x0, 0x52, 0xa4, 0xf6, 0x55, 0x7, 0xf1, 0xa3, 0xaa, 0xf8, 0xe, 0x5c, 0xff, 0xad, 0x5b, 0x9},
+ {0x0, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9},
+ {0x0, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4},
+ {0x0, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x4},
+ {0x0, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee},
+ {0x0, 0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e},
+ {0x0, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13},
+ {0x0, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0xd, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3},
+ {0x0, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61},
+ {0x0, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91},
+ {0x0, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x7, 0xb0, 0x5f, 0x73, 0x9c},
+ {0x0, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c},
+ {0x0, 0x8f, 0x3, 0x8c, 0x6, 0x89, 0x5, 0x8a, 0xc, 0x83, 0xf, 0x80, 0xa, 0x85, 0x9, 0x86},
+ {0x0, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76},
+ {0x0, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b},
+ {0x0, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b},
+ {0x0, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0xd, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2},
+ {0x0, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42},
+ {0x0, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f},
+ {0x0, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf},
+ {0x0, 0xf, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55},
+ {0x0, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5},
+ {0x0, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8},
+ {0x0, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58},
+ {0x0, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x8, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67},
+ {0x0, 0x8c, 0x5, 0x89, 0xa, 0x86, 0xf, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97},
+ {0x0, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a},
+ {0x0, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x9, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a},
+ {0x0, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0xb, 0xd7, 0xf9, 0x25, 0x5c, 0x80},
+ {0x0, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70},
+ {0x0, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d},
+ {0x0, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d},
+ {0x0, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4},
+ {0x0, 0xc, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44},
+ {0x0, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0xd, 0x31, 0x75, 0x49},
+ {0x0, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x9, 0xcd, 0xe1, 0x95, 0xb9},
+ {0x0, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0xf, 0x53},
+ {0x0, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3},
+ {0x0, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae},
+ {0x0, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e},
+ {0x0, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc},
+ {0x0, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c},
+ {0x0, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21},
+ {0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1},
+ {0x0, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b},
+ {0x0, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0xd, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb},
+ {0x0, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6},
+ {0x0, 0xf1, 0xff, 0xe, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36},
+ {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf},
+ {0x0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff},
+ {0x0, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2},
+ {0x0, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x2},
+ {0x0, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8},
+ {0x0, 0x51, 0xa2, 0xf3, 0x59, 0x8, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18},
+ {0x0, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15},
+ {0x0, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x7, 0x94, 0xe5},
+ {0x0, 0xa6, 0x51, 0xf7, 0xa2, 0x4, 0xf3, 0x55, 0x59, 0xff, 0x8, 0xae, 0xfb, 0x5d, 0xaa, 0xc},
+ {0x0, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc},
+ {0x0, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1},
+ {0x0, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x1},
+ {0x0, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0xd, 0xeb},
+ {0x0, 0xf6, 0xf1, 0x7, 0xff, 0x9, 0xe, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b},
+ {0x0, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16},
+ {0x0, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6},
+ {0x0, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0xb, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf},
+ {0x0, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f},
+ {0x0, 0x6, 0xc, 0xa, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22},
+ {0x0, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2},
+ {0x0, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38},
+ {0x0, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0xd, 0x52, 0x24, 0xbe, 0xc8},
+ {0x0, 0x46, 0x8c, 0xca, 0x5, 0x43, 0x89, 0xcf, 0xa, 0x4c, 0x86, 0xc0, 0xf, 0x49, 0x83, 0xc5},
+ {0x0, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35},
+ {0x0, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x6, 0xb1, 0xa, 0xda, 0x61, 0x67, 0xdc, 0xc, 0xb7},
+ {0x0, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0xc, 0xec, 0x47},
+ {0x0, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a},
+ {0x0, 0x8b, 0xb, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba},
+ {0x0, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50},
+ {0x0, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0xb, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0},
+ {0x0, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x6, 0x76, 0xad},
+ {0x0, 0xcb, 0x8b, 0x40, 0xb, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d},
+ {0x0, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64},
+ {0x0, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94},
+ {0x0, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99},
+ {0x0, 0xb, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69},
+ {0x0, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x7, 0x7c, 0xff, 0x84, 0x9, 0x72, 0xe, 0x75, 0xf8, 0x83},
+ {0x0, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0xc, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73},
+ {0x0, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0xf, 0x93, 0xc8, 0x25, 0x7e},
+ {0x0, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e}}
+
+// galMultiply multiplies two elements of the field.
+// It uses a lookup table, which is roughly 40% faster than the log/exp version below.
+func galMultiply(a, b byte) byte {
+ return mulTable[a][b]
+}
+
+// Original function:
+/*
+// galMultiply multiplies two elements of the field.
+func galMultiply(a, b byte) byte {
+ if a == 0 || b == 0 {
+ return 0
+ }
+ logA := int(logTable[a])
+ logB := int(logTable[b])
+ return expTable[logA+logB]
+}
+*/
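+
+// Sketch (not part of this change): how the split nibble tables above relate
+// to the full table. Multiplication distributes over XOR, so the low- and
+// high-nibble partial products combine into the same result as mulTable.
+/*
+func galMultiplyNibbles(a, b byte) byte {
+ return mulTableLow[a][b&0x0f] ^ mulTableHigh[a][b>>4]
+}
+*/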
+
+// galDivide is the inverse of galMultiply.
+func galDivide(a, b byte) byte {
+ if a == 0 {
+ return 0
+ }
+ if b == 0 {
+ panic("Argument 'divisor' is 0")
+ }
+ logA := int(logTable[a])
+ logB := int(logTable[b])
+ logResult := logA - logB
+ if logResult < 0 {
+ logResult += 255
+ }
+ return expTable[uint8(logResult)]
+}
+
+// galOneOver is the same as galDivide(1, a).
+func galOneOver(a byte) byte {
+ if a == 0 {
+ panic("Argument 'divisor' is 0")
+ }
+ logResult := logTable[a] ^ 255
+ return expTable[logResult]
+}
+
+// galExp computes a**n.
+//
+// The result will be the same as multiplying a times itself n times.
+func galExp(a byte, n int) byte {
+ if n == 0 {
+ return 1
+ }
+ if a == 0 {
+ return 0
+ }
+
+ logA := logTable[a]
+ logResult := int(logA) * n
+ for logResult >= 255 {
+ logResult -= 255
+ }
+ return expTable[uint8(logResult)]
+}
+
+func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte {
+ if !avx2CodeGen {
+ panic("codegen not enabled")
+ }
+ total := inputs * outputs
+
+ // Duplicated in+out
+ wantBytes := total * 32 * 2
+ if cap(dst) < wantBytes {
+ dst = AllocAligned(1, wantBytes)[0]
+ } else {
+ dst = dst[:wantBytes]
+ }
+ for i, row := range matrixRows[:outputs] {
+ for j, idx := range row[inIdx : inIdx+inputs] {
+ dstIdx := (j*outputs + i) * 64
+ dstPart := dst[dstIdx:]
+ dstPart = dstPart[:64]
+ lo := mulTableLow[idx][:]
+ hi := mulTableHigh[idx][:]
+ copy(dstPart[:16], lo)
+ copy(dstPart[16:32], lo)
+ copy(dstPart[32:48], hi)
+ copy(dstPart[48:64], hi)
+ }
+ }
+ return dst
+}
+
+var gf2p811dMulMatrices = [256]uint64{0, 0x102040810204080, 0x8001828488102040, 0x8103868c983060c0, 0x408041c2c4881020, 0x418245cad4a850a0, 0xc081c3464c983060, 0xc183c74e5cb870e0, 0x2040a061e2c48810, 0x2142a469f2e4c890, 0xa04122e56ad4a850, 0xa14326ed7af4e8d0, 0x60c0e1a3264c9830, 0x61c2e5ab366cd8b0, 0xe0c16327ae5cb870, 0xe1c3672fbe7cf8f0, 0x102050b071e2c488, 0x112254b861c28408, 0x9021d234f9f2e4c8, 0x9123d63ce9d2a448, 0x50a01172b56ad4a8, 0x51a2157aa54a9428, 0xd0a193f63d7af4e8, 0xd1a397fe2d5ab468, 0x3060f0d193264c98, 0x3162f4d983060c18, 0xb06172551b366cd8, 0xb163765d0b162c58, 0x70e0b11357ae5cb8, 0x71e2b51b478e1c38, 0xf0e13397dfbe7cf8, 0xf1e3379fcf9e3c78, 0x8810a8d83871e2c4, 0x8912acd02851a244, 0x8112a5cb061c284, 0x9132e54a0418204, 0xc890e91afcf9f2e4, 0xc992ed12ecd9b264, 0x48916b9e74e9d2a4, 0x49936f9664c99224, 0xa85008b9dab56ad4, 0xa9520cb1ca952a54, 0x28518a3d52a54a94, 0x29538e3542850a14, 0xe8d0497b1e3d7af4, 0xe9d24d730e1d3a74, 0x68d1cbff962d5ab4, 0x69d3cff7860d1a34, 0x9830f8684993264c, 0x9932fc6059b366cc, 0x18317aecc183060c, 0x19337ee4d1a3468c, 0xd8b0b9aa8d1b366c, 0xd9b2bda29d3b76ec, 0x58b13b2e050b162c, 0x59b33f26152b56ac, 0xb8705809ab57ae5c, 0xb9725c01bb77eedc, 0x3871da8d23478e1c, 0x3973de853367ce9c, 0xf8f019cb6fdfbe7c, 0xf9f21dc37ffffefc, 0x78f19b4fe7cf9e3c, 0x79f39f47f7efdebc, 0xc488d46c1c3871e2, 0xc58ad0640c183162, 0x448956e8942851a2, 0x458b52e084081122, 0x840895aed8b061c2, 0x850a91a6c8902142, 0x409172a50a04182, 0x50b132240800102, 0xe4c8740dfefcf9f2, 0xe5ca7005eedcb972, 0x64c9f68976ecd9b2, 0x65cbf28166cc9932, 0xa44835cf3a74e9d2, 0xa54a31c72a54a952, 0x2449b74bb264c992, 0x254bb343a2448912, 0xd4a884dc6ddab56a, 0xd5aa80d47dfaf5ea, 0x54a90658e5ca952a, 0x55ab0250f5ead5aa, 0x9428c51ea952a54a, 0x952ac116b972e5ca, 0x1429479a2142850a, 0x152b43923162c58a, 0xf4e824bd8f1e3d7a, 0xf5ea20b59f3e7dfa, 0x74e9a639070e1d3a, 0x75eba231172e5dba, 0xb468657f4b962d5a, 0xb56a61775bb66dda, 0x3469e7fbc3860d1a, 0x356be3f3d3a64d9a, 0x4c987cb424499326, 0x4d9a78bc3469d3a6, 0xcc99fe30ac59b366, 0xcd9bfa38bc79f3e6, 0xc183d76e0c18306, 0xd1a397ef0e1c386, 0x8c19bff268d1a346, 0x8d1bbbfa78f1e3c6, 0x6cd8dcd5c68d1b36, 0x6ddad8ddd6ad5bb6, 0xecd95e514e9d3b76, 0xeddb5a595ebd7bf6, 0x2c589d1702050b16, 0x2d5a991f12254b96, 0xac591f938a152b56, 0xad5b1b9b9a356bd6, 0x5cb82c0455ab57ae, 0x5dba280c458b172e, 0xdcb9ae80ddbb77ee, 0xddbbaa88cd9b376e, 0x1c386dc69123478e, 0x1d3a69ce8103070e, 0x9c39ef42193367ce, 0x9d3beb4a0913274e, 0x7cf88c65b76fdfbe, 0x7dfa886da74f9f3e, 0xfcf90ee13f7ffffe, 0xfdfb0ae92f5fbf7e, 0x3c78cda773e7cf9e, 0x3d7ac9af63c78f1e, 0xbc794f23fbf7efde, 0xbd7b4b2bebd7af5e, 0xe2c46a368e1c3871, 0xe3c66e3e9e3c78f1, 0x62c5e8b2060c1831, 0x63c7ecba162c58b1, 0xa2442bf44a942851, 0xa3462ffc5ab468d1, 0x2245a970c2840811, 0x2347ad78d2a44891, 0xc284ca576cd8b061, 0xc386ce5f7cf8f0e1, 0x428548d3e4c89021, 0x43874cdbf4e8d0a1, 0x82048b95a850a041, 0x83068f9db870e0c1, 0x205091120408001, 0x3070d193060c081, 0xf2e43a86fffefcf9, 0xf3e63e8eefdebc79, 0x72e5b80277eedcb9, 0x73e7bc0a67ce9c39, 0xb2647b443b76ecd9, 0xb3667f4c2b56ac59, 0x3265f9c0b366cc99, 0x3367fdc8a3468c19, 0xd2a49ae71d3a74e9, 0xd3a69eef0d1a3469, 0x52a51863952a54a9, 0x53a71c6b850a1429, 0x9224db25d9b264c9, 0x9326df2dc9922449, 0x122559a151a24489, 0x13275da941820409, 0x6ad4c2eeb66ddab5, 0x6bd6c6e6a64d9a35, 0xead5406a3e7dfaf5, 0xebd744622e5dba75, 0x2a54832c72e5ca95, 0x2b56872462c58a15, 0xaa5501a8faf5ead5, 0xab5705a0ead5aa55, 0x4a94628f54a952a5, 0x4b96668744891225, 0xca95e00bdcb972e5, 0xcb97e403cc993265, 0xa14234d90214285, 0xb16274580010205, 0x8a15a1c9183162c5, 0x8b17a5c108112245, 0x7af4925ec78f1e3d, 
+ 0x7bf69656d7af5ebd, 0xfaf510da4f9f3e7d, 0xfbf714d25fbf7efd, 0x3a74d39c03070e1d, 0x3b76d79413274e9d, 0xba7551188b172e5d, 0xbb7755109b376edd, 0x5ab4323f254b962d, 0x5bb63637356bd6ad, 0xdab5b0bbad5bb66d, 0xdbb7b4b3bd7bf6ed, 0x1a3473fde1c3860d, 0x1b3677f5f1e3c68d, 0x9a35f17969d3a64d, 0x9b37f57179f3e6cd, 0x264cbe5a92244993, 0x274eba5282040913, 0xa64d3cde1a3469d3, 0xa74f38d60a142953, 0x66ccff9856ac59b3, 0x67cefb90468c1933, 0xe6cd7d1cdebc79f3, 0xe7cf7914ce9c3973, 0x60c1e3b70e0c183, 0x70e1a3360c08103, 0x860d9cbff8f0e1c3, 0x870f98b7e8d0a143, 0x468c5ff9b468d1a3, 0x478e5bf1a4489123, 0xc68ddd7d3c78f1e3, 0xc78fd9752c58b163, 0x366ceeeae3c68d1b, 0x376eeae2f3e6cd9b, 0xb66d6c6e6bd6ad5b, 0xb76f68667bf6eddb, 0x76ecaf28274e9d3b, 0x77eeab20376eddbb, 0xf6ed2dacaf5ebd7b, 0xf7ef29a4bf7efdfb, 0x162c4e8b0102050b, 0x172e4a831122458b, 0x962dcc0f8912254b, 0x972fc807993265cb, 0x56ac0f49c58a152b, 0x57ae0b41d5aa55ab, 0xd6ad8dcd4d9a356b, 0xd7af89c55dba75eb, 0xae5c1682aa55ab57, 0xaf5e128aba75ebd7, 0x2e5d940622458b17, 0x2f5f900e3265cb97, 0xeedc57406eddbb77, 0xefde53487efdfbf7, 0x6eddd5c4e6cd9b37, 0x6fdfd1ccf6eddbb7, 0x8e1cb6e348912347, 0x8f1eb2eb58b163c7, 0xe1d3467c0810307, 0xf1f306fd0a14387, 0xce9cf7218c193367, 0xcf9ef3299c3973e7, 0x4e9d75a504091327, 0x4f9f71ad142953a7, 0xbe7c4632dbb76fdf, 0xbf7e423acb972f5f, 0x3e7dc4b653a74f9f, 0x3f7fc0be43870f1f, 0xfefc07f01f3f7fff, 0xfffe03f80f1f3f7f, 0x7efd8574972f5fbf, 0x7fff817c870f1f3f, 0x9e3ce6533973e7cf, 0x9f3ee25b2953a74f, 0x1e3d64d7b163c78f, 0x1f3f60dfa143870f, 0xdebca791fdfbf7ef, 0xdfbea399eddbb76f, 0x5ebd251575ebd7af, 0x5fbf211d65cb972f}
+
+func genGFNIMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []uint64) []uint64 {
+ if !avx2CodeGen {
+ panic("codegen not enabled")
+ }
+ total := inputs * outputs
+
+ // Duplicated in+out
+ dst = dst[:total]
+ for i, row := range matrixRows[:outputs] {
+ for j, idx := range row[inIdx : inIdx+inputs] {
+ dst[j*outputs+i] = gf2p811dMulMatrices[idx]
+ }
+ }
+ return dst
+}
+
+// xor slices writing to out.
+func sliceXorGo(in, out []byte, _ *options) {
+ for len(out) >= 32 {
+ inS := in[:32]
+ v0 := binary.LittleEndian.Uint64(out[:8]) ^ binary.LittleEndian.Uint64(inS[:8])
+ v1 := binary.LittleEndian.Uint64(out[8:16]) ^ binary.LittleEndian.Uint64(inS[8:16])
+ v2 := binary.LittleEndian.Uint64(out[16:24]) ^ binary.LittleEndian.Uint64(inS[16:24])
+ v3 := binary.LittleEndian.Uint64(out[24:32]) ^ binary.LittleEndian.Uint64(inS[24:32])
+ binary.LittleEndian.PutUint64(out[:8], v0)
+ binary.LittleEndian.PutUint64(out[8:16], v1)
+ binary.LittleEndian.PutUint64(out[16:24], v2)
+ binary.LittleEndian.PutUint64(out[24:32], v3)
+ out = out[32:]
+ in = in[32:]
+ }
+ out = out[:len(in)]
+ for n, input := range in {
+ out[n] ^= input
+ }
+}
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
new file mode 100644
index 000000000..8099f1664
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
@@ -0,0 +1,583 @@
+//go:build !noasm && !appengine && !gccgo && !nopshufb
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+
+package reedsolomon
+
+const pshufb = true
+
+//go:noescape
+func galMulSSSE3(low, high, in, out []byte)
+
+//go:noescape
+func galMulSSSE3Xor(low, high, in, out []byte)
+
+//go:noescape
+func galMulAVX2Xor(low, high, in, out []byte)
+
+//go:noescape
+func galMulAVX2(low, high, in, out []byte)
+
+//go:noescape
+func galMulAVX2Xor_64(low, high, in, out []byte)
+
+//go:noescape
+func galMulAVX2_64(low, high, in, out []byte)
+
+// This is what the assembler routines do in blocks of 16 bytes:
+/*
+func galMulSSSE3(low, high, in, out []byte) {
+ for n, input := range in {
+ l := input & 0xf
+ h := input >> 4
+ out[n] = low[l] ^ high[h]
+ }
+}
+
+func galMulSSSE3Xor(low, high, in, out []byte) {
+ for n, input := range in {
+ l := input & 0xf
+ h := input >> 4
+ out[n] ^= low[l] ^ high[h]
+ }
+}
+*/
+
+// bigSwitchover is the size where 64 bytes are processed per loop.
+const bigSwitchover = 128
+
+func galMulSlice(c byte, in, out []byte, o *options) {
+ if c == 1 {
+ copy(out, in)
+ return
+ }
+ if o.useAVX2 {
+ if len(in) >= bigSwitchover {
+ galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+ done := (len(in) >> 6) << 6
+ in = in[done:]
+ out = out[done:]
+ }
+ if len(in) > 32 {
+ galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+ done := (len(in) >> 5) << 5
+ in = in[done:]
+ out = out[done:]
+ }
+ } else if o.useSSSE3 {
+ galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+ done := (len(in) >> 4) << 4
+ in = in[done:]
+ out = out[done:]
+ }
+ out = out[:len(in)]
+ mt := mulTable[c][:256]
+ for i := range in {
+ out[i] = mt[in[i]]
+ }
+}
+
+func galMulSliceXor(c byte, in, out []byte, o *options) {
+ if c == 1 {
+ sliceXor(in, out, o)
+ return
+ }
+
+ if o.useAVX2 {
+ if len(in) >= bigSwitchover {
+ galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+ done := (len(in) >> 6) << 6
+ in = in[done:]
+ out = out[done:]
+ }
+ if len(in) >= 32 {
+ galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+ done := (len(in) >> 5) << 5
+ in = in[done:]
+ out = out[done:]
+ }
+ } else if o.useSSSE3 {
+ galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+ done := (len(in) >> 4) << 4
+ in = in[done:]
+ out = out[done:]
+ }
+ if len(in) == 0 {
+ return
+ }
+ out = out[:len(in)]
+ mt := mulTable[c][:256]
+ for i := range in {
+ out[i] ^= mt[in[i]]
+ }
+}
+
+// simple slice xor
+func sliceXor(in, out []byte, o *options) {
+ if o.useSSE2 {
+ if len(in) >= bigSwitchover {
+ if o.useAVX2 {
+ avx2XorSlice_64(in, out)
+ done := (len(in) >> 6) << 6
+ in = in[done:]
+ out = out[done:]
+ } else {
+ sSE2XorSlice_64(in, out)
+ done := (len(in) >> 6) << 6
+ in = in[done:]
+ out = out[done:]
+ }
+ }
+ if len(in) >= 16 {
+ sSE2XorSlice(in, out)
+ done := (len(in) >> 4) << 4
+ in = in[done:]
+ out = out[done:]
+ }
+ } else {
+ sliceXorGo(in, out, o)
+ return
+ }
+ out = out[:len(in)]
+ for i := range in {
+ out[i] ^= in[i]
+ }
+}
+
+// 4-way butterfly
+func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+ if len(work[0]) == 0 {
+ return
+ }
+
+ t01 := &multiply256LUT[log_m01]
+ t23 := &multiply256LUT[log_m23]
+ t02 := &multiply256LUT[log_m02]
+ if o.useAVX512 {
+ if log_m01 == modulus {
+ if log_m23 == modulus {
+ if log_m02 == modulus {
+ ifftDIT4_avx512_7(work, dist*24, t01, t23, t02)
+ } else {
+ ifftDIT4_avx512_3(work, dist*24, t01, t23, t02)
+ }
+ } else {
+ if log_m02 == modulus {
+ ifftDIT4_avx512_5(work, dist*24, t01, t23, t02)
+ } else {
+ ifftDIT4_avx512_1(work, dist*24, t01, t23, t02)
+ }
+ }
+ } else {
+ if log_m23 == modulus {
+ if log_m02 == modulus {
+ ifftDIT4_avx512_6(work, dist*24, t01, t23, t02)
+ } else {
+ ifftDIT4_avx512_2(work, dist*24, t01, t23, t02)
+ }
+ } else {
+ if log_m02 == modulus {
+ ifftDIT4_avx512_4(work, dist*24, t01, t23, t02)
+ } else {
+ ifftDIT4_avx512_0(work, dist*24, t01, t23, t02)
+ }
+ }
+ }
+ return
+ } else if o.useAVX2 {
+ if log_m01 == modulus {
+ if log_m23 == modulus {
+ if log_m02 == modulus {
+ ifftDIT4_avx2_7(work, dist*24, t01, t23, t02)
+ } else {
+ ifftDIT4_avx2_3(work, dist*24, t01, t23, t02)
+ }
+ } else {
+ if log_m02 == modulus {
+ ifftDIT4_avx2_5(work, dist*24, t01, t23, t02)
+ } else {
+ ifftDIT4_avx2_1(work, dist*24, t01, t23, t02)
+ }
+ }
+ } else {
+ if log_m23 == modulus {
+ if log_m02 == modulus {
+ ifftDIT4_avx2_6(work, dist*24, t01, t23, t02)
+ } else {
+ ifftDIT4_avx2_2(work, dist*24, t01, t23, t02)
+ }
+ } else {
+ if log_m02 == modulus {
+ ifftDIT4_avx2_4(work, dist*24, t01, t23, t02)
+ } else {
+ ifftDIT4_avx2_0(work, dist*24, t01, t23, t02)
+ }
+ }
+ }
+ return
+ }
+ ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
+ if len(work[0]) == 0 {
+ return
+ }
+
+ if false && o.useAvx512GFNI {
+ // Note that these currently require that the length is a multiple of 64.
+ t01 := gf2p811dMulMatrices[log_m01]
+ t23 := gf2p811dMulMatrices[log_m23]
+ t02 := gf2p811dMulMatrices[log_m02]
+ if log_m01 == modulus8 {
+ if log_m23 == modulus8 {
+ if log_m02 == modulus8 {
+ ifftDIT48_gfni_7(work, dist*24, t01, t23, t02)
+ } else {
+ ifftDIT48_gfni_3(work, dist*24, t01, t23, t02)
+ }
+ } else {
+ if log_m02 == modulus8 {
+ ifftDIT48_gfni_5(work, dist*24, t01, t23, t02)
+ } else {
+ ifftDIT48_gfni_1(work, dist*24, t01, t23, t02)
+ }
+ }
+ } else {
+ if log_m23 == modulus8 {
+ if log_m02 == modulus8 {
+ ifftDIT48_gfni_6(work, dist*24, t01, t23, t02)
+ } else {
+ ifftDIT48_gfni_2(work, dist*24, t01, t23, t02)
+ }
+ } else {
+ if log_m02 == modulus8 {
+ ifftDIT48_gfni_4(work, dist*24, t01, t23, t02)
+ } else {
+ ifftDIT48_gfni_0(work, dist*24, t01, t23, t02)
+ }
+ }
+ }
+ return
+ }
+ if o.useAVX2 {
+ // Note that these currently require that the length is a multiple of 64.
+ t01 := &multiply256LUT8[log_m01]
+ t23 := &multiply256LUT8[log_m23]
+ t02 := &multiply256LUT8[log_m02]
+ if log_m01 == modulus8 {
+ if log_m23 == modulus8 {
+ if log_m02 == modulus8 {
+ ifftDIT48_avx2_7(work, dist*24, t01, t23, t02)
+ } else {
+ ifftDIT48_avx2_3(work, dist*24, t01, t23, t02)
+ }
+ } else {
+ if log_m02 == modulus8 {
+ ifftDIT48_avx2_5(work, dist*24, t01, t23, t02)
+ } else {
+ ifftDIT48_avx2_1(work, dist*24, t01, t23, t02)
+ }
+ }
+ } else {
+ if log_m23 == modulus8 {
+ if log_m02 == modulus8 {
+ ifftDIT48_avx2_6(work, dist*24, t01, t23, t02)
+ } else {
+ ifftDIT48_avx2_2(work, dist*24, t01, t23, t02)
+ }
+ } else {
+ if log_m02 == modulus8 {
+ ifftDIT48_avx2_4(work, dist*24, t01, t23, t02)
+ } else {
+ ifftDIT48_avx2_0(work, dist*24, t01, t23, t02)
+ }
+ }
+ }
+ return
+ }
+ ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+ if len(work[0]) == 0 {
+ return
+ }
+
+ t01 := &multiply256LUT[log_m01]
+ t23 := &multiply256LUT[log_m23]
+ t02 := &multiply256LUT[log_m02]
+ if o.useAVX512 {
+ if log_m02 == modulus {
+ if log_m01 == modulus {
+ if log_m23 == modulus {
+ fftDIT4_avx512_7(work, dist*24, t01, t23, t02)
+ } else {
+ fftDIT4_avx512_3(work, dist*24, t01, t23, t02)
+ }
+ } else {
+ if log_m23 == modulus {
+ fftDIT4_avx512_5(work, dist*24, t01, t23, t02)
+ } else {
+ fftDIT4_avx512_1(work, dist*24, t01, t23, t02)
+ }
+ }
+ } else {
+ if log_m01 == modulus {
+ if log_m23 == modulus {
+ fftDIT4_avx512_6(work, dist*24, t01, t23, t02)
+ } else {
+ fftDIT4_avx512_2(work, dist*24, t01, t23, t02)
+ }
+ } else {
+ if log_m23 == modulus {
+ fftDIT4_avx512_4(work, dist*24, t01, t23, t02)
+ } else {
+ fftDIT4_avx512_0(work, dist*24, t01, t23, t02)
+ }
+ }
+ }
+ return
+ } else if o.useAVX2 {
+ if log_m02 == modulus {
+ if log_m01 == modulus {
+ if log_m23 == modulus {
+ fftDIT4_avx2_7(work, dist*24, t01, t23, t02)
+ } else {
+ fftDIT4_avx2_3(work, dist*24, t01, t23, t02)
+ }
+ } else {
+ if log_m23 == modulus {
+ fftDIT4_avx2_5(work, dist*24, t01, t23, t02)
+ } else {
+ fftDIT4_avx2_1(work, dist*24, t01, t23, t02)
+ }
+ }
+ } else {
+ if log_m01 == modulus {
+ if log_m23 == modulus {
+ fftDIT4_avx2_6(work, dist*24, t01, t23, t02)
+ } else {
+ fftDIT4_avx2_2(work, dist*24, t01, t23, t02)
+ }
+ } else {
+ if log_m23 == modulus {
+ fftDIT4_avx2_4(work, dist*24, t01, t23, t02)
+ } else {
+ fftDIT4_avx2_0(work, dist*24, t01, t23, t02)
+ }
+ }
+ }
+ return
+ }
+ fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
+ if len(work[0]) == 0 {
+ return
+ }
+
+ if false && o.useAvx512GFNI {
+ t01 := gf2p811dMulMatrices[log_m01]
+ t23 := gf2p811dMulMatrices[log_m23]
+ t02 := gf2p811dMulMatrices[log_m02]
+ // Note that these currently require that the length is a multiple of 64.
+ if log_m02 == modulus8 {
+ if log_m01 == modulus8 {
+ if log_m23 == modulus8 {
+ fftDIT48_gfni_7(work, dist*24, t01, t23, t02)
+ } else {
+ fftDIT48_gfni_3(work, dist*24, t01, t23, t02)
+ }
+ } else {
+ if log_m23 == modulus8 {
+ fftDIT48_gfni_5(work, dist*24, t01, t23, t02)
+ } else {
+ fftDIT48_gfni_1(work, dist*24, t01, t23, t02)
+ }
+ }
+ } else {
+ if log_m01 == modulus8 {
+ if log_m23 == modulus8 {
+ fftDIT48_gfni_6(work, dist*24, t01, t23, t02)
+ } else {
+ fftDIT48_gfni_2(work, dist*24, t01, t23, t02)
+ }
+ } else {
+ if log_m23 == modulus8 {
+ fftDIT48_gfni_4(work, dist*24, t01, t23, t02)
+ } else {
+ fftDIT48_gfni_0(work, dist*24, t01, t23, t02)
+ }
+ }
+ }
+ return
+ }
+ if o.useAVX2 {
+ t01 := &multiply256LUT8[log_m01]
+ t23 := &multiply256LUT8[log_m23]
+ t02 := &multiply256LUT8[log_m02]
+ // Note that these currently require that the length is a multiple of 64.
+ if log_m02 == modulus8 {
+ if log_m01 == modulus8 {
+ if log_m23 == modulus8 {
+ fftDIT48_avx2_7(work, dist*24, t01, t23, t02)
+ } else {
+ fftDIT48_avx2_3(work, dist*24, t01, t23, t02)
+ }
+ } else {
+ if log_m23 == modulus8 {
+ fftDIT48_avx2_5(work, dist*24, t01, t23, t02)
+ } else {
+ fftDIT48_avx2_1(work, dist*24, t01, t23, t02)
+ }
+ }
+ } else {
+ if log_m01 == modulus8 {
+ if log_m23 == modulus8 {
+ fftDIT48_avx2_6(work, dist*24, t01, t23, t02)
+ } else {
+ fftDIT48_avx2_2(work, dist*24, t01, t23, t02)
+ }
+ } else {
+ if log_m23 == modulus8 {
+ fftDIT48_avx2_4(work, dist*24, t01, t23, t02)
+ } else {
+ fftDIT48_avx2_0(work, dist*24, t01, t23, t02)
+ }
+ }
+ }
+ return
+ }
+ fftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 2-way butterfly forward
+func fftDIT2(x, y []byte, log_m ffe, o *options) {
+ if len(x) == 0 {
+ return
+ }
+ if o.useAVX2 {
+ tmp := &multiply256LUT[log_m]
+ fftDIT2_avx2(x, y, tmp)
+ } else if o.useSSSE3 {
+ tmp := &multiply256LUT[log_m]
+ fftDIT2_ssse3(x, y, tmp)
+ } else {
+ // Reference version:
+ refMulAdd(x, y, log_m)
+ sliceXor(x, y, o)
+ }
+}
+
+// 2-way butterfly forward
+func fftDIT28(x, y []byte, log_m ffe8, o *options) {
+ if len(x) == 0 {
+ return
+ }
+
+ if o.useAVX2 {
+ fftDIT28_avx2(x, y, &multiply256LUT8[log_m])
+ if len(x)&63 == 0 {
+ return
+ }
+ done := (len(y) >> 6) << 6
+ y = y[done:]
+ x = x[done:]
+ }
+ mulAdd8(x, y, log_m, o)
+ sliceXor(x, y, o)
+}
+
+// 2-way butterfly inverse
+func ifftDIT28(x, y []byte, log_m ffe8, o *options) {
+ if len(x) == 0 {
+ return
+ }
+
+ if o.useAVX2 {
+ ifftDIT28_avx2(x, y, &multiply256LUT8[log_m])
+ if len(x)&63 == 0 {
+ return
+ }
+ done := (len(y) >> 6) << 6
+ y = y[done:]
+ x = x[done:]
+ }
+ sliceXor(x, y, o)
+ mulAdd8(x, y, log_m, o)
+}
+
+func mulAdd8(x, y []byte, log_m ffe8, o *options) {
+ if o.useAVX2 {
+ t := &multiply256LUT8[log_m]
+ galMulAVX2Xor_64(t[:16], t[16:32], y, x)
+ done := (len(y) >> 6) << 6
+ y = y[done:]
+ x = x[done:]
+ } else if o.useSSSE3 {
+ t := &multiply256LUT8[log_m]
+ galMulSSSE3Xor(t[:16], t[16:32], y, x)
+ done := (len(y) >> 4) << 4
+ y = y[done:]
+ x = x[done:]
+ }
+ refMulAdd8(x, y, log_m)
+}
+
+// 2-way butterfly
+func ifftDIT2(x, y []byte, log_m ffe, o *options) {
+ if len(x) == 0 {
+ return
+ }
+ if o.useAVX2 {
+ tmp := &multiply256LUT[log_m]
+ ifftDIT2_avx2(x, y, tmp)
+ } else if o.useSSSE3 {
+ tmp := &multiply256LUT[log_m]
+ ifftDIT2_ssse3(x, y, tmp)
+ } else {
+ // Reference version:
+ sliceXor(x, y, o)
+ refMulAdd(x, y, log_m)
+ }
+}
+
+func mulgf16(x, y []byte, log_m ffe, o *options) {
+ if len(x) == 0 {
+ return
+ }
+ if o.useAVX2 {
+ tmp := &multiply256LUT[log_m]
+ mulgf16_avx2(x, y, tmp)
+ } else if o.useSSSE3 {
+ tmp := &multiply256LUT[log_m]
+ mulgf16_ssse3(x, y, tmp)
+ } else {
+ refMul(x, y, log_m)
+ }
+}
+
+func mulgf8(out, in []byte, log_m ffe8, o *options) {
+ if o.useAVX2 {
+ t := &multiply256LUT8[log_m]
+ galMulAVX2_64(t[:16], t[16:32], in, out)
+ done := (len(in) >> 6) << 6
+ in = in[done:]
+ out = out[done:]
+ } else if o.useSSSE3 {
+ t := &multiply256LUT8[log_m]
+ galMulSSSE3(t[:16], t[16:32], in, out)
+ done := (len(in) >> 4) << 4
+ in = in[done:]
+ out = out[done:]
+ }
+ out = out[:len(in)]
+ mt := mul8LUTs[log_m].Value[:]
+ for i := range in {
+ out[i] = byte(mt[in[i]])
+ }
+}
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_amd64.s b/vendor/github.com/klauspost/reedsolomon/galois_amd64.s
new file mode 100644
index 000000000..18e08c316
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.s
@@ -0,0 +1,310 @@
+//+build !noasm
+//+build !appengine
+//+build !gccgo
+//+build !nopshufb
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+
+// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf
+// and http://jerasure.org/jerasure/gf-complete/tree/master
+
+// func galMulSSSE3Xor(low, high, in, out []byte)
+TEXT ·galMulSSSE3Xor(SB), 7, $0
+ MOVQ low+0(FP), SI // SI: &low
+ MOVQ high+24(FP), DX // DX: &high
+ MOVOU (SI), X6 // X6 low
+ MOVOU (DX), X7 // X7: high
+ MOVQ $15, BX // BX: low mask
+ MOVQ BX, X8
+ PXOR X5, X5
+ MOVQ in+48(FP), SI // R11: &in
+ MOVQ in_len+56(FP), R9 // R9: len(in)
+ MOVQ out+72(FP), DX // DX: &out
+ PSHUFB X5, X8 // X8: lomask (unpacked)
+ SHRQ $4, R9 // len(in) / 16
+ MOVQ SI, AX
+ MOVQ DX, BX
+ ANDQ $15, AX
+ ANDQ $15, BX
+ CMPQ R9, $0
+ JEQ done_xor
+ ORQ AX, BX
+ CMPQ BX, $0
+ JNZ loopback_xor
+
+loopback_xor_aligned:
+ MOVOA (SI), X0 // in[x]
+ MOVOA (DX), X4 // out[x]
+ MOVOA X0, X1 // in[x]
+ MOVOA X6, X2 // low copy
+ MOVOA X7, X3 // high copy
+ PSRLQ $4, X1 // X1: high input
+ PAND X8, X0 // X0: low input
+ PAND X8, X1 // X1: high input
+ PSHUFB X0, X2 // X2: mul low part
+ PSHUFB X1, X3 // X3: mul high part
+ PXOR X2, X3 // X3: Result
+ PXOR X4, X3 // X3: Result xor existing out
+ MOVOA X3, (DX) // Store
+ ADDQ $16, SI // in+=16
+ ADDQ $16, DX // out+=16
+ SUBQ $1, R9
+ JNZ loopback_xor_aligned
+ JMP done_xor
+
+loopback_xor:
+ MOVOU (SI), X0 // in[x]
+ MOVOU (DX), X4 // out[x]
+ MOVOU X0, X1 // in[x]
+ MOVOU X6, X2 // low copy
+ MOVOU X7, X3 // high copy
+ PSRLQ $4, X1 // X1: high input
+ PAND X8, X0 // X0: low input
+ PAND X8, X1 // X1: high input
+ PSHUFB X0, X2 // X2: mul low part
+ PSHUFB X1, X3 // X3: mul high part
+ PXOR X2, X3 // X3: Result
+ PXOR X4, X3 // X3: Result xor existing out
+ MOVOU X3, (DX) // Store
+ ADDQ $16, SI // in+=16
+ ADDQ $16, DX // out+=16
+ SUBQ $1, R9
+ JNZ loopback_xor
+
+done_xor:
+ RET
+
+// func galMulSSSE3(low, high, in, out []byte)
+TEXT ·galMulSSSE3(SB), 7, $0
+ MOVQ low+0(FP), SI // SI: &low
+ MOVQ high+24(FP), DX // DX: &high
+ MOVOU (SI), X6 // X6 low
+ MOVOU (DX), X7 // X7: high
+ MOVQ $15, BX // BX: low mask
+ MOVQ BX, X8
+ PXOR X5, X5
+ MOVQ in+48(FP), SI // R11: &in
+ MOVQ in_len+56(FP), R9 // R9: len(in)
+ MOVQ out+72(FP), DX // DX: &out
+ PSHUFB X5, X8 // X8: lomask (unpacked)
+ MOVQ SI, AX
+ MOVQ DX, BX
+ SHRQ $4, R9 // len(in) / 16
+ ANDQ $15, AX
+ ANDQ $15, BX
+ CMPQ R9, $0
+ JEQ done
+ ORQ AX, BX
+ CMPQ BX, $0
+ JNZ loopback
+
+loopback_aligned:
+ MOVOA (SI), X0 // in[x]
+ MOVOA X0, X1 // in[x]
+ MOVOA X6, X2 // low copy
+ MOVOA X7, X3 // high copy
+ PSRLQ $4, X1 // X1: high input
+ PAND X8, X0 // X0: low input
+ PAND X8, X1 // X1: high input
+ PSHUFB X0, X2 // X2: mul low part
+ PSHUFB X1, X3 // X3: mul high part
+ PXOR X2, X3 // X3: Result
+ MOVOA X3, (DX) // Store
+ ADDQ $16, SI // in+=16
+ ADDQ $16, DX // out+=16
+ SUBQ $1, R9
+ JNZ loopback_aligned
+ JMP done
+
+loopback:
+ MOVOU (SI), X0 // in[x]
+ MOVOU X0, X1 // in[x]
+ MOVOA X6, X2 // low copy
+ MOVOA X7, X3 // high copy
+ PSRLQ $4, X1 // X1: high input
+ PAND X8, X0 // X0: low input
+ PAND X8, X1 // X1: high input
+ PSHUFB X0, X2 // X2: mul low part
+ PSHUFB X1, X3 // X3: mul high part
+ PXOR X2, X3 // X3: Result
+ MOVOU X3, (DX) // Store
+ ADDQ $16, SI // in+=16
+ ADDQ $16, DX // out+=16
+ SUBQ $1, R9
+ JNZ loopback
+
+done:
+ RET
+
+// func galMulAVX2Xor(low, high, in, out []byte)
+TEXT ·galMulAVX2Xor(SB), 7, $0
+ MOVQ low+0(FP), SI // SI: &low
+ MOVQ high+24(FP), DX // DX: &high
+ MOVQ $15, BX // BX: low mask
+ MOVQ BX, X5
+ MOVOU (SI), X6 // X6: low
+ MOVOU (DX), X7 // X7: high
+ MOVQ in_len+56(FP), R9 // R9: len(in)
+
+ VINSERTI128 $1, X6, Y6, Y6 // low
+ VINSERTI128 $1, X7, Y7, Y7 // high
+ VPBROADCASTB X5, Y8 // Y8: lomask (unpacked)
+
+ SHRQ $5, R9 // len(in) / 32
+ MOVQ out+72(FP), DX // DX: &out
+ MOVQ in+48(FP), SI // SI: &in
+ TESTQ R9, R9
+ JZ done_xor_avx2
+
+loopback_xor_avx2:
+ VMOVDQU (SI), Y0
+ VMOVDQU (DX), Y4
+ VPSRLQ $4, Y0, Y1 // Y1: high input
+ VPAND Y8, Y0, Y0 // Y0: low input
+ VPAND Y8, Y1, Y1 // Y1: high input
+ VPSHUFB Y0, Y6, Y2 // Y2: mul low part
+ VPSHUFB Y1, Y7, Y3 // Y3: mul high part
+ VPXOR Y3, Y2, Y3 // Y3: Result
+ VPXOR Y4, Y3, Y4 // Y4: Result
+ VMOVDQU Y4, (DX)
+
+ ADDQ $32, SI // in+=32
+ ADDQ $32, DX // out+=32
+ SUBQ $1, R9
+ JNZ loopback_xor_avx2
+
+done_xor_avx2:
+ VZEROUPPER
+ RET
+
+// func galMulAVX2(low, high, in, out []byte)
+TEXT ·galMulAVX2(SB), 7, $0
+ MOVQ low+0(FP), SI // SI: &low
+ MOVQ high+24(FP), DX // DX: &high
+ MOVQ $15, BX // BX: low mask
+ MOVQ BX, X5
+ MOVOU (SI), X6 // X6: low
+ MOVOU (DX), X7 // X7: high
+ MOVQ in_len+56(FP), R9 // R9: len(in)
+
+ VINSERTI128 $1, X6, Y6, Y6 // low
+ VINSERTI128 $1, X7, Y7, Y7 // high
+ VPBROADCASTB X5, Y8 // Y8: lomask (unpacked)
+
+ SHRQ $5, R9 // len(in) / 32
+ MOVQ out+72(FP), DX // DX: &out
+ MOVQ in+48(FP), SI // SI: &in
+ TESTQ R9, R9
+ JZ done_avx2
+
+loopback_avx2:
+ VMOVDQU (SI), Y0
+ VPSRLQ $4, Y0, Y1 // Y1: high input
+ VPAND Y8, Y0, Y0 // Y0: low input
+ VPAND Y8, Y1, Y1 // Y1: high input
+ VPSHUFB Y0, Y6, Y2 // Y2: mul low part
+ VPSHUFB Y1, Y7, Y3 // Y3: mul high part
+ VPXOR Y3, Y2, Y4 // Y4: Result
+ VMOVDQU Y4, (DX)
+
+ ADDQ $32, SI // in+=32
+ ADDQ $32, DX // out+=32
+ SUBQ $1, R9
+ JNZ loopback_avx2
+
+done_avx2:
+ VZEROUPPER
+ RET
+
+// func galMulAVX2Xor_64(low, high, in, out []byte)
+TEXT ·galMulAVX2Xor_64(SB), 7, $0
+ MOVQ low+0(FP), SI // SI: &low
+ MOVQ high+24(FP), DX // DX: &high
+ MOVQ $15, BX // BX: low mask
+ MOVQ BX, X5
+ MOVQ in_len+56(FP), R9 // R9: len(in)
+
+ VBROADCASTI128 (SI), Y6 // low table
+ VBROADCASTI128 (DX), Y7 // high table
+ VPBROADCASTB X5, Y8 // Y8: lomask (unpacked)
+
+ SHRQ $6, R9 // len(in) / 64
+ MOVQ out+72(FP), DX // DX: &out
+ MOVQ in+48(FP), SI // SI: &in
+ TESTQ R9, R9
+ JZ done_xor_avx2_64
+
+loopback_xor_avx2_64:
+ VMOVDQU (SI), Y0
+ VMOVDQU 32(SI), Y10
+ VMOVDQU (DX), Y4
+ VMOVDQU 32(DX), Y14
+ VPSRLQ $4, Y0, Y1 // Y1: high input
+ VPSRLQ $4, Y10, Y11 // Y11: high input 2
+ VPAND Y8, Y0, Y0 // Y0: low input
+ VPAND Y8, Y10, Y10 // Y10: low input 2
+ VPAND Y8, Y1, Y1 // Y1: high input
+ VPAND Y8, Y11, Y11 // Y11: high input 2
+ VPSHUFB Y0, Y6, Y2 // Y2: mul low part
+ VPSHUFB Y10, Y6, Y12 // Y12: mul low part 2
+ VPSHUFB Y1, Y7, Y3 // Y3: mul high part
+ VPSHUFB Y11, Y7, Y13 // Y13: mul high part 2
+ VPXOR Y3, Y2, Y3 // Y3: Result
+ VPXOR Y13, Y12, Y13 // Y13: Result 2
+ VPXOR Y4, Y3, Y4 // Y4: Result
+ VPXOR Y14, Y13, Y14 // Y14: Result 2
+ VMOVDQU Y4, (DX)
+ VMOVDQU Y14, 32(DX)
+
+ ADDQ $64, SI // in+=64
+ ADDQ $64, DX // out+=64
+ SUBQ $1, R9
+ JNZ loopback_xor_avx2_64
+
+done_xor_avx2_64:
+ VZEROUPPER
+ RET
+
+// func galMulAVX2_64(low, high, in, out []byte)
+TEXT ·galMulAVX2_64(SB), 7, $0
+ MOVQ low+0(FP), SI // SI: &low
+ MOVQ high+24(FP), DX // DX: &high
+ MOVQ $15, BX // BX: low mask
+ MOVQ BX, X5
+ MOVQ in_len+56(FP), R9 // R9: len(in)
+ VBROADCASTI128 (SI), Y6 // low table
+ VBROADCASTI128 (DX), Y7 // high table
+ VPBROADCASTB X5, Y8 // Y8: lomask (unpacked)
+
+ SHRQ $6, R9 // len(in) / 64
+ MOVQ out+72(FP), DX // DX: &out
+ MOVQ in+48(FP), SI // SI: &in
+ TESTQ R9, R9
+ JZ done_avx2_64
+
+loopback_avx2_64:
+ VMOVDQU (SI), Y0
+ VMOVDQU 32(SI), Y10
+ VPSRLQ $4, Y0, Y1 // Y1: high input
+ VPSRLQ $4, Y10, Y11 // Y11: high input 2
+ VPAND Y8, Y0, Y0 // Y0: low input
+ VPAND Y8, Y10, Y10 // Y10: low input 2
+ VPAND Y8, Y1, Y1 // Y1: high input
+ VPAND Y8, Y11, Y11 // Y11: high input 2
+ VPSHUFB Y0, Y6, Y2 // Y2: mul low part
+ VPSHUFB Y10, Y6, Y12 // Y12: mul low part 2
+ VPSHUFB Y1, Y7, Y3 // Y3: mul high part
+ VPSHUFB Y11, Y7, Y13 // Y13: mul high part 2
+ VPXOR Y3, Y2, Y4 // Y4: Result
+ VPXOR Y13, Y12, Y14 // Y14: Result 2
+ VMOVDQU Y4, (DX)
+ VMOVDQU Y14, 32(DX)
+
+ ADDQ $64, SI // in+=64
+ ADDQ $64, DX // out+=64
+ SUBQ $1, R9
+ JNZ loopback_avx2_64
+
+done_avx2_64:
+ VZEROUPPER
+ RET
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_arm64.go b/vendor/github.com/klauspost/reedsolomon/galois_arm64.go
new file mode 100644
index 000000000..8ef402bf0
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.go
@@ -0,0 +1,130 @@
+//go:build !noasm && !appengine && !gccgo && !nopshufb
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+// Copyright 2017, Minio, Inc.
+
+package reedsolomon
+
+const pshufb = true
+
+//go:noescape
+func galMulNEON(low, high, in, out []byte)
+
+//go:noescape
+func galMulXorNEON(low, high, in, out []byte)
+
+func galMulSlice(c byte, in, out []byte, o *options) {
+ if c == 1 {
+ copy(out, in)
+ return
+ }
+ var done int
+ galMulNEON(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+ done = (len(in) >> 5) << 5
+
+ remain := len(in) - done
+ if remain > 0 {
+ mt := mulTable[c][:256]
+ for i := done; i < len(in); i++ {
+ out[i] = mt[in[i]]
+ }
+ }
+}
+
+func galMulSliceXor(c byte, in, out []byte, o *options) {
+ if c == 1 {
+ sliceXor(in, out, o)
+ return
+ }
+ var done int
+ galMulXorNEON(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+ done = (len(in) >> 5) << 5
+
+ remain := len(in) - done
+ if remain > 0 {
+ mt := mulTable[c][:256]
+ for i := done; i < len(in); i++ {
+ out[i] ^= mt[in[i]]
+ }
+ }
+}
+
+// 4-way butterfly
+func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+ ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
+ ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+ fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
+ fftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 2-way butterfly forward
+func fftDIT2(x, y []byte, log_m ffe, o *options) {
+ // Reference version:
+ refMulAdd(x, y, log_m)
+ // 64 byte aligned, always full.
+ xorSliceNEON(x, y)
+}
+
+// 2-way butterfly forward
+func fftDIT28(x, y []byte, log_m ffe8, o *options) {
+ // Reference version:
+ mulAdd8(x, y, log_m, o)
+ sliceXor(x, y, o)
+}
+
+// 2-way butterfly
+func ifftDIT2(x, y []byte, log_m ffe, o *options) {
+ // 64 byte aligned, always full.
+ xorSliceNEON(x, y)
+ // Reference version:
+ refMulAdd(x, y, log_m)
+}
+
+// 2-way butterfly inverse
+func ifftDIT28(x, y []byte, log_m ffe8, o *options) {
+ // Reference version:
+ sliceXor(x, y, o)
+ mulAdd8(x, y, log_m, o)
+}
+
+func mulgf16(x, y []byte, log_m ffe, o *options) {
+ refMul(x, y, log_m)
+}
+
+func mulAdd8(out, in []byte, log_m ffe8, o *options) {
+ t := &multiply256LUT8[log_m]
+ galMulXorNEON(t[:16], t[16:32], in, out)
+ done := (len(in) >> 5) << 5
+ in = in[done:]
+ if len(in) > 0 {
+ out = out[done:]
+ refMulAdd8(in, out, log_m)
+ }
+}
+
+func mulgf8(out, in []byte, log_m ffe8, o *options) {
+ var done int
+ t := &multiply256LUT8[log_m]
+ galMulNEON(t[:16], t[16:32], in, out)
+ done = (len(in) >> 5) << 5
+
+ remain := len(in) - done
+ if remain > 0 {
+ mt := mul8LUTs[log_m].Value[:]
+ for i := done; i < len(in); i++ {
+ out[i] ^= byte(mt[in[i]])
+ }
+ }
+}
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_arm64.s b/vendor/github.com/klauspost/reedsolomon/galois_arm64.s
new file mode 100644
index 000000000..772dfac96
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.s
@@ -0,0 +1,102 @@
+//+build !noasm
+//+build !appengine
+//+build !gccgo
+//+build !nopshufb
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+// Copyright 2017, Minio, Inc.
+
+#define LOAD(LO1, LO2, HI1, HI2) \
+ VLD1.P 32(R1), [LO1.B16, LO2.B16] \
+ \
+ \ // Get low input and high input
+ VUSHR $4, LO1.B16, HI1.B16 \
+ VUSHR $4, LO2.B16, HI2.B16 \
+ VAND V8.B16, LO1.B16, LO1.B16 \
+ VAND V8.B16, LO2.B16, LO2.B16
+
+#define GALOIS_MUL(MUL_LO, MUL_HI, OUT1, OUT2, TMP1, TMP2) \
+ \ // Mul low part and mul high part
+ VTBL V0.B16, [MUL_LO.B16], OUT1.B16 \
+ VTBL V10.B16, [MUL_HI.B16], OUT2.B16 \
+ VTBL V1.B16, [MUL_LO.B16], TMP1.B16 \
+ VTBL V11.B16, [MUL_HI.B16], TMP2.B16 \
+ \
+ \ // Combine results
+ VEOR OUT2.B16, OUT1.B16, OUT1.B16 \
+ VEOR TMP2.B16, TMP1.B16, OUT2.B16
+
+// func galMulNEON(low, high, in, out []byte)
+TEXT ·galMulNEON(SB), 7, $0
+ MOVD in_base+48(FP), R1
+ MOVD in_len+56(FP), R2 // length of message
+ MOVD out_base+72(FP), R5
+ SUBS $32, R2
+ BMI complete
+
+ MOVD low+0(FP), R10 // R10: &low
+ MOVD high+24(FP), R11 // R11: &high
+ VLD1 (R10), [V6.B16]
+ VLD1 (R11), [V7.B16]
+
+ //
+ // Use an extra instruction below since `VDUP R3, V8.B16` generates assembler error
+ // WORD $0x4e010c68 // dup v8.16b, w3
+ //
+ MOVD $0x0f, R3
+ VMOV R3, V8.B[0]
+ VDUP V8.B[0], V8.B16
+
+loop:
+ // Main loop
+ LOAD(V0, V1, V10, V11)
+ GALOIS_MUL(V6, V7, V4, V5, V14, V15)
+
+ // Store result
+ VST1.P [V4.D2, V5.D2], 32(R5)
+
+ SUBS $32, R2
+ BPL loop
+
+complete:
+ RET
+
+// func galMulXorNEON(low, high, in, out []byte)
+TEXT ·galMulXorNEON(SB), 7, $0
+ MOVD in_base+48(FP), R1
+ MOVD in_len+56(FP), R2 // length of message
+ MOVD out_base+72(FP), R5
+ SUBS $32, R2
+ BMI completeXor
+
+ MOVD low+0(FP), R10 // R10: &low
+ MOVD high+24(FP), R11 // R11: &high
+ VLD1 (R10), [V6.B16]
+ VLD1 (R11), [V7.B16]
+
+ //
+ // Use an extra instruction below since `VDUP R3, V8.B16` generates assembler error
+ // WORD $0x4e010c68 // dup v8.16b, w3
+ //
+ MOVD $0x0f, R3
+ VMOV R3, V8.B[0]
+ VDUP V8.B[0], V8.B16
+
+loopXor:
+ // Main loop
+ VLD1 (R5), [V20.B16, V21.B16]
+
+ LOAD(V0, V1, V10, V11)
+ GALOIS_MUL(V6, V7, V4, V5, V14, V15)
+
+ VEOR V20.B16, V4.B16, V4.B16
+ VEOR V21.B16, V5.B16, V5.B16
+
+ // Store result
+ VST1.P [V4.D2, V5.D2], 32(R5)
+
+ SUBS $32, R2
+ BPL loopXor
+
+completeXor:
+ RET
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.go
new file mode 100644
index 000000000..dac9b1363
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.go
@@ -0,0 +1,3532 @@
+// Code generated by command: go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon. DO NOT EDIT.
+
+//go:build !appengine && !noasm && !nogen && !nopshufb && gc
+
+package reedsolomon
+
+func _dummy_()
+
+//go:noescape
+func sSE2XorSlice(in []byte, out []byte)
+
+//go:noescape
+func sSE2XorSlice_64(in []byte, out []byte)
+
+//go:noescape
+func avx2XorSlice_64(in []byte, out []byte)
+
+// mulAvxTwo_1x1_64 takes 1 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x1_64 takes 1 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x1 takes 1 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x1_64Xor takes 1 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x1Xor takes 1 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x1_64Xor takes 1 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x2_64 takes 1 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x2_64 takes 1 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x2 takes 1 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x2_64Xor takes 1 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x2Xor takes 1 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x2_64Xor takes 1 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x3_64 takes 1 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x3_64 takes 1 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x3 takes 1 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x3_64Xor takes 1 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x3Xor takes 1 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x3_64Xor takes 1 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxTwo_1x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x4 takes 1 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x4_64 takes 1 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x4 takes 1 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x4_64Xor takes 1 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x4Xor takes 1 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x4Xor takes 1 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxTwo_1x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x5 takes 1 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x5_64 takes 1 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x5 takes 1 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x5_64Xor takes 1 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x5Xor takes 1 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x5Xor takes 1 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxTwo_1x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x6 takes 1 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x6_64 takes 1 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x6 takes 1 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x6_64Xor takes 1 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x6Xor takes 1 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x6Xor takes 1 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxTwo_1x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x7 takes 1 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x7_64 takes 1 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x7 takes 1 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x7_64Xor takes 1 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x7Xor takes 1 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x7Xor takes 1 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxTwo_1x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x8 takes 1 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x8_64 takes 1 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x8 takes 1 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x8_64Xor takes 1 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x8Xor takes 1 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x8Xor takes 1 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxTwo_1x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x9 takes 1 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_1x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x9_64 takes 1 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x9 takes 1 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x9_64Xor takes 1 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x9Xor takes 1 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x9Xor takes 1 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxTwo_1x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x10 takes 1 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_1x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x10_64 takes 1 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x10 takes 1 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x10_64Xor takes 1 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x10Xor takes 1 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_1x10Xor takes 1 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x1_64 takes 2 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x1_64 takes 2 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x1 takes 2 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x1_64Xor takes 2 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x1Xor takes 2 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x1_64Xor takes 2 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x2_64 takes 2 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x2_64 takes 2 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x2 takes 2 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x2_64Xor takes 2 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x2Xor takes 2 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x2_64Xor takes 2 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x3_64 takes 2 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x3_64 takes 2 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x3 takes 2 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x3_64Xor takes 2 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x3Xor takes 2 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x3_64Xor takes 2 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxTwo_2x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x4 takes 2 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x4_64 takes 2 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x4 takes 2 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x4_64Xor takes 2 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x4Xor takes 2 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x4Xor takes 2 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxTwo_2x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x5 takes 2 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x5_64 takes 2 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x5 takes 2 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x5_64Xor takes 2 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x5Xor takes 2 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x5Xor takes 2 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxTwo_2x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x6 takes 2 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x6_64 takes 2 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x6 takes 2 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x6_64Xor takes 2 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x6Xor takes 2 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x6Xor takes 2 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxTwo_2x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x7 takes 2 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x7_64 takes 2 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x7 takes 2 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x7_64Xor takes 2 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x7Xor takes 2 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x7Xor takes 2 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxTwo_2x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x8 takes 2 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x8_64 takes 2 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x8 takes 2 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x8_64Xor takes 2 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x8Xor takes 2 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x8Xor takes 2 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxTwo_2x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x9 takes 2 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x9_64 takes 2 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x9 takes 2 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x9_64Xor takes 2 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x9Xor takes 2 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x9Xor takes 2 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxTwo_2x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x10 takes 2 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x10_64 takes 2 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x10 takes 2 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x10_64Xor takes 2 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x10Xor takes 2 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_2x10Xor takes 2 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x1_64 takes 3 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x1_64 takes 3 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x1 takes 3 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x1_64Xor takes 3 inputs and produces 1 output.
+//
+//go:noescape
+func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x1Xor takes 3 inputs and produces 1 output.
+//
+//go:noescape
+func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x1_64Xor takes 3 inputs and produces 1 output.
+//
+//go:noescape
+func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x2_64 takes 3 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x2_64 takes 3 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x2 takes 3 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x2_64Xor takes 3 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x2Xor takes 3 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x2_64Xor takes 3 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x3_64 takes 3 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x3_64 takes 3 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x3 takes 3 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x3_64Xor takes 3 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x3Xor takes 3 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x3_64Xor takes 3 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxTwo_3x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x4 takes 3 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x4_64 takes 3 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x4 takes 3 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x4_64Xor takes 3 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x4Xor takes 3 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x4Xor takes 3 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxTwo_3x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x5 takes 3 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x5_64 takes 3 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x5 takes 3 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x5_64Xor takes 3 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x5Xor takes 3 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x5Xor takes 3 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxTwo_3x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x6 takes 3 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x6_64 takes 3 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x6 takes 3 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x6_64Xor takes 3 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x6Xor takes 3 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x6Xor takes 3 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxTwo_3x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x7 takes 3 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x7_64 takes 3 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x7 takes 3 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x7_64Xor takes 3 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x7Xor takes 3 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x7Xor takes 3 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxTwo_3x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x8 takes 3 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x8_64 takes 3 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x8 takes 3 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x8_64Xor takes 3 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x8Xor takes 3 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x8Xor takes 3 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxTwo_3x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x9 takes 3 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x9_64 takes 3 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x9 takes 3 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x9_64Xor takes 3 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x9Xor takes 3 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x9Xor takes 3 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxTwo_3x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x10 takes 3 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x10_64 takes 3 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x10 takes 3 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x10_64Xor takes 3 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x10Xor takes 3 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_3x10Xor takes 3 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x1_64 takes 4 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x1_64 takes 4 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x1 takes 4 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x1_64Xor takes 4 inputs and produces 1 output.
+//
+//go:noescape
+func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x1Xor takes 4 inputs and produces 1 output.
+//
+//go:noescape
+func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x1_64Xor takes 4 inputs and produces 1 output.
+//
+//go:noescape
+func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x2_64 takes 4 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x2_64 takes 4 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x2 takes 4 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x2_64Xor takes 4 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x2Xor takes 4 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x2_64Xor takes 4 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x3_64 takes 4 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x3_64 takes 4 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x3 takes 4 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x3_64Xor takes 4 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x3Xor takes 4 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x3_64Xor takes 4 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxTwo_4x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x4 takes 4 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x4_64 takes 4 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x4 takes 4 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x4_64Xor takes 4 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x4Xor takes 4 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x4Xor takes 4 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxTwo_4x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x5 takes 4 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x5_64 takes 4 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x5 takes 4 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x5_64Xor takes 4 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x5Xor takes 4 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x5Xor takes 4 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxTwo_4x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x6 takes 4 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x6_64 takes 4 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x6 takes 4 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x6_64Xor takes 4 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x6Xor takes 4 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x6Xor takes 4 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxTwo_4x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x7 takes 4 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x7_64 takes 4 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x7 takes 4 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x7_64Xor takes 4 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x7Xor takes 4 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x7Xor takes 4 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x8 takes 4 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x8_64 takes 4 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x8 takes 4 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x8_64Xor takes 4 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x8Xor takes 4 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x8Xor takes 4 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxTwo_4x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x9 takes 4 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x9_64 takes 4 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x9 takes 4 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x9_64Xor takes 4 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x9Xor takes 4 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x9Xor takes 4 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxTwo_4x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x10 takes 4 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x10_64 takes 4 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x10 takes 4 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x10_64Xor takes 4 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x10Xor takes 4 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_4x10Xor takes 4 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x1_64 takes 5 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x1_64 takes 5 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x1 takes 5 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x1_64Xor takes 5 inputs and produces 1 output.
+//
+//go:noescape
+func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x1Xor takes 5 inputs and produces 1 output.
+//
+//go:noescape
+func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x1_64Xor takes 5 inputs and produces 1 output.
+//
+//go:noescape
+func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x2_64 takes 5 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x2_64 takes 5 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x2 takes 5 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x2_64Xor takes 5 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x2Xor takes 5 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x2_64Xor takes 5 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x3_64 takes 5 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x3_64 takes 5 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x3 takes 5 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x3_64Xor takes 5 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x3Xor takes 5 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x3_64Xor takes 5 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxTwo_5x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x4 takes 5 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x4_64 takes 5 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x4 takes 5 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x4_64Xor takes 5 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x4Xor takes 5 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x4Xor takes 5 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxTwo_5x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x5 takes 5 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x5_64 takes 5 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x5 takes 5 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x5_64Xor takes 5 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x5Xor takes 5 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x5Xor takes 5 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxTwo_5x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x6 takes 5 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x6_64 takes 5 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x6 takes 5 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x6_64Xor takes 5 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x6Xor takes 5 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x6Xor takes 5 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxTwo_5x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x7 takes 5 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x7_64 takes 5 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x7 takes 5 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x7_64Xor takes 5 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x7Xor takes 5 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x7Xor takes 5 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxTwo_5x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x8 takes 5 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x8_64 takes 5 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x8 takes 5 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x8_64Xor takes 5 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x8Xor takes 5 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x8Xor takes 5 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxTwo_5x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x9 takes 5 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x9_64 takes 5 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x9 takes 5 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x9_64Xor takes 5 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x9Xor takes 5 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x9Xor takes 5 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxTwo_5x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x10 takes 5 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x10_64 takes 5 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x10 takes 5 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x10_64Xor takes 5 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x10Xor takes 5 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_5x10Xor takes 5 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x1_64 takes 6 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x1_64 takes 6 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x1 takes 6 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x1_64Xor takes 6 inputs and produces 1 output.
+//
+//go:noescape
+func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x1Xor takes 6 inputs and produces 1 output.
+//
+//go:noescape
+func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x1_64Xor takes 6 inputs and produces 1 output.
+//
+//go:noescape
+func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x2_64 takes 6 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x2_64 takes 6 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x2 takes 6 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x2_64Xor takes 6 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x2Xor takes 6 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x2_64Xor takes 6 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x3_64 takes 6 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x3_64 takes 6 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x3 takes 6 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x3_64Xor takes 6 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x3Xor takes 6 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x3_64Xor takes 6 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxTwo_6x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x4 takes 6 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x4_64 takes 6 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x4 takes 6 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x4_64Xor takes 6 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x4Xor takes 6 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x4Xor takes 6 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxTwo_6x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x5 takes 6 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x5_64 takes 6 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x5 takes 6 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x5_64Xor takes 6 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x5Xor takes 6 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x5Xor takes 6 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxTwo_6x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x6 takes 6 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x6_64 takes 6 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x6 takes 6 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x6_64Xor takes 6 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x6Xor takes 6 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x6Xor takes 6 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxTwo_6x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x7 takes 6 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x7_64 takes 6 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x7 takes 6 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x7_64Xor takes 6 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x7Xor takes 6 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x7Xor takes 6 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxTwo_6x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x8 takes 6 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x8_64 takes 6 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x8 takes 6 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x8_64Xor takes 6 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x8Xor takes 6 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x8Xor takes 6 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxTwo_6x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x9 takes 6 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x9_64 takes 6 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x9 takes 6 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x9_64Xor takes 6 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x9Xor takes 6 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x9Xor takes 6 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxTwo_6x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x10 takes 6 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x10_64 takes 6 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x10 takes 6 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x10_64Xor takes 6 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x10Xor takes 6 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_6x10Xor takes 6 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x1_64 takes 7 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x1_64 takes 7 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x1 takes 7 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x1_64Xor takes 7 inputs and produces 1 output.
+//
+//go:noescape
+func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x1Xor takes 7 inputs and produces 1 output.
+//
+//go:noescape
+func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x1_64Xor takes 7 inputs and produces 1 output.
+//
+//go:noescape
+func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x2_64 takes 7 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x2_64 takes 7 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x2 takes 7 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x2_64Xor takes 7 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x2Xor takes 7 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x2_64Xor takes 7 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x3_64 takes 7 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x3_64 takes 7 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x3 takes 7 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x3_64Xor takes 7 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x3Xor takes 7 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x3_64Xor takes 7 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxTwo_7x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x4 takes 7 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x4_64 takes 7 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x4 takes 7 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x4_64Xor takes 7 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x4Xor takes 7 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x4Xor takes 7 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxTwo_7x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x5 takes 7 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x5_64 takes 7 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x5 takes 7 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x5_64Xor takes 7 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x5Xor takes 7 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x5Xor takes 7 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxTwo_7x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x6 takes 7 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x6_64 takes 7 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x6 takes 7 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x6_64Xor takes 7 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x6Xor takes 7 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x6Xor takes 7 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxTwo_7x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x7 takes 7 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x7_64 takes 7 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x7 takes 7 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x7_64Xor takes 7 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x7Xor takes 7 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x7Xor takes 7 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxTwo_7x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x8 takes 7 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x8_64 takes 7 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x8 takes 7 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x8_64Xor takes 7 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x8Xor takes 7 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x8Xor takes 7 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxTwo_7x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x9 takes 7 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x9_64 takes 7 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x9 takes 7 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x9_64Xor takes 7 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x9Xor takes 7 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x9Xor takes 7 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxTwo_7x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x10 takes 7 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x10_64 takes 7 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x10 takes 7 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x10_64Xor takes 7 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x10Xor takes 7 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_7x10Xor takes 7 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x1_64 takes 8 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x1_64 takes 8 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x1 takes 8 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x1_64Xor takes 8 inputs and produces 1 output.
+//
+//go:noescape
+func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x1Xor takes 8 inputs and produces 1 output.
+//
+//go:noescape
+func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x1_64Xor takes 8 inputs and produces 1 output.
+//
+//go:noescape
+func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x2_64 takes 8 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x2_64 takes 8 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x2 takes 8 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x2_64Xor takes 8 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x2Xor takes 8 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x2_64Xor takes 8 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x3_64 takes 8 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x3_64 takes 8 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x3 takes 8 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x3_64Xor takes 8 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x3Xor takes 8 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x3_64Xor takes 8 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxTwo_8x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x4 takes 8 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x4_64 takes 8 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x4 takes 8 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x4_64Xor takes 8 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x4Xor takes 8 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x4Xor takes 8 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxTwo_8x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x5 takes 8 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x5_64 takes 8 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x5 takes 8 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x5_64Xor takes 8 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x5Xor takes 8 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x5Xor takes 8 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxTwo_8x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x6 takes 8 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x6_64 takes 8 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x6 takes 8 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x6_64Xor takes 8 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x6Xor takes 8 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x6Xor takes 8 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxTwo_8x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x7 takes 8 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x7_64 takes 8 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x7 takes 8 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x7_64Xor takes 8 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x7Xor takes 8 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x7Xor takes 8 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxTwo_8x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x8 takes 8 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x8_64 takes 8 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x8 takes 8 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x8_64Xor takes 8 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x8Xor takes 8 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x8Xor takes 8 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxTwo_8x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x9 takes 8 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x9_64 takes 8 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x9 takes 8 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x9_64Xor takes 8 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x9Xor takes 8 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x9Xor takes 8 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxTwo_8x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x10 takes 8 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x10_64 takes 8 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x10 takes 8 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x10_64Xor takes 8 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x10Xor takes 8 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_8x10Xor takes 8 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x1_64 takes 9 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x1_64 takes 9 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x1 takes 9 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x1_64Xor takes 9 inputs and produces 1 output.
+//
+//go:noescape
+func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x1Xor takes 9 inputs and produces 1 output.
+//
+//go:noescape
+func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x1_64Xor takes 9 inputs and produces 1 output.
+//
+//go:noescape
+func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x2_64 takes 9 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x2_64 takes 9 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x2 takes 9 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x2_64Xor takes 9 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x2Xor takes 9 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x2_64Xor takes 9 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x3_64 takes 9 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x3_64 takes 9 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x3 takes 9 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x3_64Xor takes 9 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x3Xor takes 9 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x3_64Xor takes 9 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxTwo_9x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x4 takes 9 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x4_64 takes 9 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x4 takes 9 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x4_64Xor takes 9 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x4Xor takes 9 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x4Xor takes 9 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxTwo_9x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x5 takes 9 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x5_64 takes 9 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x5 takes 9 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x5_64Xor takes 9 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x5Xor takes 9 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x5Xor takes 9 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxTwo_9x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x6 takes 9 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x6_64 takes 9 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x6 takes 9 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x6_64Xor takes 9 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x6Xor takes 9 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x6Xor takes 9 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxTwo_9x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x7 takes 9 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x7_64 takes 9 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x7 takes 9 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x7_64Xor takes 9 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x7Xor takes 9 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x7Xor takes 9 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxTwo_9x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x8 takes 9 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x8_64 takes 9 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x8 takes 9 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x8_64Xor takes 9 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x8Xor takes 9 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x8Xor takes 9 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxTwo_9x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x9 takes 9 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x9_64 takes 9 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x9 takes 9 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x9_64Xor takes 9 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x9Xor takes 9 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x9Xor takes 9 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxTwo_9x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x10 takes 9 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x10_64 takes 9 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x10 takes 9 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x10_64Xor takes 9 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x10Xor takes 9 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_9x10Xor takes 9 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x1_64 takes 10 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x1_64 takes 10 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x1 takes 10 inputs and produces 1 output.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x1_64Xor takes 10 inputs and produces 1 output.
+//
+//go:noescape
+func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x1Xor takes 10 inputs and produces 1 output.
+//
+//go:noescape
+func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x1_64Xor takes 10 inputs and produces 1 output.
+//
+//go:noescape
+func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x2_64 takes 10 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x2_64 takes 10 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x2 takes 10 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x2_64Xor takes 10 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x2Xor takes 10 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x2_64Xor takes 10 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x3_64 takes 10 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x3_64 takes 10 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x3 takes 10 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x3_64Xor takes 10 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x3Xor takes 10 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x3_64Xor takes 10 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxTwo_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x4 takes 10 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x4_64 takes 10 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x4 takes 10 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x4_64Xor takes 10 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x4Xor takes 10 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x4Xor takes 10 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxTwo_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x5 takes 10 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x5_64 takes 10 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x5 takes 10 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x5_64Xor takes 10 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x5Xor takes 10 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x5Xor takes 10 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxTwo_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x6 takes 10 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x6_64 takes 10 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x6 takes 10 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x6_64Xor takes 10 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x6Xor takes 10 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x6Xor takes 10 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxTwo_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x7 takes 10 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x7_64 takes 10 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x7 takes 10 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x7_64Xor takes 10 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x7Xor takes 10 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x7Xor takes 10 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxTwo_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x8 takes 10 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x8_64 takes 10 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x8 takes 10 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x8_64Xor takes 10 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x8Xor takes 10 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x8Xor takes 10 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxTwo_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x9 takes 10 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x9_64 takes 10 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x9 takes 10 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x9_64Xor takes 10 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x9Xor takes 10 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x9Xor takes 10 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxTwo_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x10 takes 10 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x10_64 takes 10 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x10 takes 10 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x10_64Xor takes 10 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x10Xor takes 10 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxTwo_10x10Xor takes 10 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
+//go:noescape
+func ifftDIT2_avx2(x []byte, y []byte, table *[128]uint8)
+
+//go:noescape
+func fftDIT2_avx2(x []byte, y []byte, table *[128]uint8)
+
+//go:noescape
+func mulgf16_avx2(x []byte, y []byte, table *[128]uint8)
+
+//go:noescape
+func ifftDIT4_avx512_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func fftDIT4_avx512_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func ifftDIT4_avx512_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func fftDIT4_avx512_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func ifftDIT4_avx512_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func fftDIT4_avx512_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func ifftDIT4_avx512_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func fftDIT4_avx512_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func ifftDIT4_avx512_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func fftDIT4_avx512_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func ifftDIT4_avx512_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func fftDIT4_avx512_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func ifftDIT4_avx512_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func fftDIT4_avx512_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func ifftDIT4_avx512_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func fftDIT4_avx512_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func ifftDIT4_avx2_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func fftDIT4_avx2_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func ifftDIT4_avx2_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func fftDIT4_avx2_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func ifftDIT4_avx2_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func fftDIT4_avx2_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func ifftDIT4_avx2_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func fftDIT4_avx2_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func ifftDIT4_avx2_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func fftDIT4_avx2_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func ifftDIT4_avx2_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func fftDIT4_avx2_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func ifftDIT4_avx2_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func fftDIT4_avx2_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func ifftDIT4_avx2_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func fftDIT4_avx2_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+
+//go:noescape
+func ifftDIT2_ssse3(x []byte, y []byte, table *[128]uint8)
+
+//go:noescape
+func fftDIT2_ssse3(x []byte, y []byte, table *[128]uint8)
+
+//go:noescape
+func mulgf16_ssse3(x []byte, y []byte, table *[128]uint8)
+
+//go:noescape
+func ifftDIT28_avx2(x []byte, y []byte, table *[32]uint8)
+
+//go:noescape
+func fftDIT28_avx2(x []byte, y []byte, table *[32]uint8)
+
+//go:noescape
+func ifftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+
+//go:noescape
+func fftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+
+//go:noescape
+func ifftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+
+//go:noescape
+func fftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+
+//go:noescape
+func ifftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+
+//go:noescape
+func fftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+
+//go:noescape
+func ifftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+
+//go:noescape
+func fftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+
+//go:noescape
+func ifftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+
+//go:noescape
+func fftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+
+//go:noescape
+func ifftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+
+//go:noescape
+func fftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+
+//go:noescape
+func ifftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+
+//go:noescape
+func fftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+
+//go:noescape
+func ifftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+
+//go:noescape
+func fftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+
+//go:noescape
+func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func fftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func ifftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func fftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func ifftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func fftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func ifftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func fftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func ifftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func fftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func ifftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func fftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func ifftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func fftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func ifftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func fftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s
new file mode 100644
index 000000000..ad253a65a
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s
@@ -0,0 +1,128293 @@
+// Code generated by command: go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon. DO NOT EDIT.
+
+//go:build !appengine && !noasm && !nogen && !nopshufb && gc
+
+#include "textflag.h"
+
+// func _dummy_()
+TEXT ·_dummy_(SB), $0
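+	// XOR3WAY computes dst ^= a ^ b; the first macro argument is unused.
+	// With GOAMD64_v4 a single VPTERNLOGD does the three-way XOR (immediate
+	// 0x96 is the truth table for a^b^c); otherwise two VPXORs are emitted.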
+#ifdef GOAMD64_v4
+#define XOR3WAY(ignore, a, b, dst) \
+ VPTERNLOGD $0x96, a, b, dst
+
+#else
+#define XOR3WAY(ignore, a, b, dst) \
+ VPXOR a, dst, dst \
+ VPXOR b, dst, dst
+
+#endif
+ RET
+
+// sSE2XorSlice will XOR in with out and store in out.
+// Processes 16 bytes/loop.
+
+// func sSE2XorSlice(in []byte, out []byte)
+// Requires: SSE2
+TEXT ·sSE2XorSlice(SB), $0-48
+ MOVQ in_base+0(FP), AX
+ MOVQ out_base+24(FP), CX
+ MOVQ in_len+8(FP), DX
+ SHRQ $0x04, DX
+ JZ end
+
+loop:
+ MOVOU (AX), X0
+ MOVOU (CX), X1
+ PXOR X0, X1
+ MOVOU X1, (CX)
+ ADDQ $0x10, AX
+ ADDQ $0x10, CX
+ DECQ DX
+ JNZ loop
+
+end:
+ RET
+
+// sSE2XorSlice_64 will XOR in with out and store in out.
+// Processes 64 bytes/loop.
+
+// func sSE2XorSlice_64(in []byte, out []byte)
+// Requires: SSE2
+TEXT ·sSE2XorSlice_64(SB), $0-48
+ MOVQ in_base+0(FP), AX
+ MOVQ out_base+24(FP), CX
+ MOVQ in_len+8(FP), DX
+ SHRQ $0x06, DX
+ JZ end
+
+loop:
+ MOVOU (AX), X0
+ MOVOU 16(AX), X2
+ MOVOU 32(AX), X4
+ MOVOU 48(AX), X6
+ MOVOU (CX), X1
+ MOVOU 16(CX), X3
+ MOVOU 32(CX), X5
+ MOVOU 48(CX), X7
+ PXOR X0, X1
+ PXOR X2, X3
+ PXOR X4, X5
+ PXOR X6, X7
+ MOVOU X1, (CX)
+ MOVOU X3, 16(CX)
+ MOVOU X5, 32(CX)
+ MOVOU X7, 48(CX)
+ ADDQ $0x40, AX
+ ADDQ $0x40, CX
+ DECQ DX
+ JNZ loop
+
+end:
+ RET
+
+// avx2XorSlice_64 will XOR in with out and store in out.
+// Processes 64 bytes/loop.
+
+// func avx2XorSlice_64(in []byte, out []byte)
+// Requires: AVX, AVX2
+TEXT ·avx2XorSlice_64(SB), $0-48
+ MOVQ in_base+0(FP), AX
+ MOVQ out_base+24(FP), CX
+ MOVQ in_len+8(FP), DX
+ SHRQ $0x06, DX
+ JZ end
+
+loop:
+ VMOVDQU (AX), Y0
+ VMOVDQU 32(AX), Y2
+ VMOVDQU (CX), Y1
+ VMOVDQU 32(CX), Y3
+ VPXOR Y0, Y1, Y1
+ VPXOR Y2, Y3, Y3
+ VMOVDQU Y1, (CX)
+ VMOVDQU Y3, 32(CX)
+ ADDQ $0x40, AX
+ ADDQ $0x40, CX
+ DECQ DX
+ JNZ loop
+
+end:
+ VZEROUPPER
+ RET
+
+// func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, SSE2
+TEXT ·mulAvxTwo_1x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x1_64_end
+ VMOVDQU (CX), Y0
+ VMOVDQU 32(CX), Y1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), DX
+ MOVQ start+72(FP), BX
+
+ // Add start offset to output
+ ADDQ BX, DX
+
+ // Add start offset to input
+ ADDQ BX, CX
+ MOVQ $0x0000000f, BX
+ MOVQ BX, X4
+ VPBROADCASTB X4, Y4
+
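+	// Y0/Y1 hold the low- and high-nibble lookup tables for this matrix
+	// coefficient and Y4 holds the broadcast 0x0f mask: each input byte is
+	// split into nibbles, VPSHUFB looks up the two partial products, and
+	// their XOR is the GF(2^8) product.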
+mulAvxTwo_1x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (CX), Y2
+ VMOVDQU 32(CX), Y3
+ ADDQ $0x40, CX
+ VPSRLQ $0x04, Y2, Y6
+ VPSRLQ $0x04, Y3, Y5
+ VPAND Y4, Y2, Y2
+ VPAND Y4, Y3, Y3
+ VPAND Y4, Y6, Y6
+ VPAND Y4, Y5, Y5
+ VPSHUFB Y2, Y0, Y2
+ VPSHUFB Y3, Y0, Y3
+ VPSHUFB Y6, Y1, Y6
+ VPSHUFB Y5, Y1, Y5
+ VPXOR Y2, Y6, Y2
+ VPXOR Y3, Y5, Y3
+
+ // Store 1 outputs
+ VMOVDQU Y2, (DX)
+ VMOVDQU Y3, 32(DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x1_64_loop
+ VZEROUPPER
+
+mulAvxTwo_1x1_64_end:
+ RET
+
+// func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 4 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), DX
+ MOVQ start+72(FP), BX
+
+ // Add start offset to output
+ ADDQ BX, DX
+
+ // Add start offset to input
+ ADDQ BX, CX
+
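+	// VGF2P8AFFINEQB multiplies every input byte by the 8x8 GF(2) bit
+	// matrix broadcast into Z0, implementing GF(2^8) multiplication by the
+	// matrix coefficient in a single instruction.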
+mulGFNI_1x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (CX), Z1
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z1, Z1
+
+ // Store 1 outputs
+ VMOVDQU64 Z1, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x1_64_loop
+ VZEROUPPER
+
+mulGFNI_1x1_64_end:
+ RET
+
+// func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 4 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x1_end
+ VBROADCASTSD (CX), Y0
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), DX
+ MOVQ start+72(FP), BX
+
+ // Add start offset to output
+ ADDQ BX, DX
+
+ // Add start offset to input
+ ADDQ BX, CX
+
+mulAvxGFNI_1x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (CX), Y1
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y1, Y1
+
+ // Store 1 outputs
+ VMOVDQU Y1, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x1_end:
+ RET
+
+// func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 4 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), DX
+ MOVQ start+72(FP), BX
+
+ // Add start offset to output
+ ADDQ BX, DX
+
+ // Add start offset to input
+ ADDQ BX, CX
+
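+	// The Xor variant accumulates: the existing output block is loaded,
+	// XORed with the new product, and written back instead of being
+	// overwritten.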
+mulGFNI_1x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (DX), Z1
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (CX), Z2
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z2, Z2
+ VXORPD Z1, Z2, Z1
+
+ // Store 1 outputs
+ VMOVDQU64 Z1, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 4 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x1Xor_end
+ VBROADCASTSD (CX), Y0
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), DX
+ MOVQ start+72(FP), BX
+
+ // Add start offset to output
+ ADDQ BX, DX
+
+ // Add start offset to input
+ ADDQ BX, CX
+
+mulAvxGFNI_1x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (DX), Y1
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (CX), Y2
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y2, Y2
+ VXORPD Y1, Y2, Y1
+
+ // Store 1 outputs
+ VMOVDQU Y1, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x1Xor_end:
+ RET
+
+// func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_1x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x1_64Xor_end
+ VMOVDQU (CX), Y0
+ VMOVDQU 32(CX), Y1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), DX
+ MOVQ start+72(FP), BX
+
+ // Add start offset to output
+ ADDQ BX, DX
+
+ // Add start offset to input
+ ADDQ BX, CX
+ MOVQ $0x0000000f, BX
+ MOVQ BX, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_1x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (DX), Y2
+ VMOVDQU 32(DX), Y3
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y7
+ ADDQ $0x40, CX
+ VPSRLQ $0x04, Y5, Y6
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y5, Y5
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y6, Y6
+ VPAND Y4, Y8, Y8
+ VPSHUFB Y5, Y0, Y5
+ VPSHUFB Y7, Y0, Y7
+ VPSHUFB Y6, Y1, Y6
+ VPSHUFB Y8, Y1, Y8
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 1 outputs
+ VMOVDQU Y2, (DX)
+ VMOVDQU Y3, 32(DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x1_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_1x1_64Xor_end:
+ RET
+
+// func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, SSE2
+TEXT ·mulAvxTwo_1x2_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x2_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), BX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+ ADDQ DI, BX
+
+ // Add start offset to input
+ ADDQ DI, DX
+ MOVQ $0x0000000f, DI
+ MOVQ DI, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_1x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y7
+ VMOVDQU 32(DX), Y9
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y8, Y8
+ VPAND Y4, Y10, Y10
+ VMOVDQU (CX), Y2
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y9, Y2, Y3
+ VPSHUFB Y7, Y2, Y2
+ VPSHUFB Y10, Y6, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y2, Y6, Y0
+ VPXOR Y3, Y5, Y1
+ VMOVDQU 64(CX), Y2
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y9, Y2, Y3
+ VPSHUFB Y7, Y2, Y2
+ VPSHUFB Y10, Y6, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y2, Y6, Y2
+ VPXOR Y3, Y5, Y3
+
+ // Store 2 outputs
+ VMOVDQU Y0, (SI)
+ VMOVDQU Y1, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y2, (BX)
+ VMOVDQU Y3, 32(BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x2_64_loop
+ VZEROUPPER
+
+mulAvxTwo_1x2_64_end:
+ RET
+
+// func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 6 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+ ADDQ SI, DX
+
+ // Add start offset to input
+ ADDQ SI, CX
+
+mulGFNI_1x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (CX), Z3
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z2
+ VGF2P8AFFINEQB $0x00, Z1, Z3, Z3
+
+ // Store 2 outputs
+ VMOVDQU64 Z2, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z3, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x2_64_loop
+ VZEROUPPER
+
+mulGFNI_1x2_64_end:
+ RET
+
+// func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x2(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 6 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+ ADDQ SI, DX
+
+ // Add start offset to input
+ ADDQ SI, CX
+
+mulAvxGFNI_1x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (CX), Y3
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y3, Y2
+ VGF2P8AFFINEQB $0x00, Y1, Y3, Y3
+
+ // Store 2 outputs
+ VMOVDQU Y2, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y3, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x2_end:
+ RET
+
+// func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 6 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+ ADDQ SI, DX
+
+ // Add start offset to input
+ ADDQ SI, CX
+
+mulGFNI_1x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (BX), Z2
+ VMOVDQU64 (DX), Z3
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (CX), Z4
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
+ VXORPD Z2, Z5, Z2
+ VGF2P8AFFINEQB $0x00, Z1, Z4, Z5
+ VXORPD Z3, Z5, Z3
+
+ // Store 2 outputs
+ VMOVDQU64 Z2, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z3, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x2Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 6 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+ ADDQ SI, DX
+
+ // Add start offset to input
+ ADDQ SI, CX
+
+mulAvxGFNI_1x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (BX), Y2
+ VMOVDQU (DX), Y3
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (CX), Y4
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y4, Y5
+ VXORPD Y2, Y5, Y2
+ VGF2P8AFFINEQB $0x00, Y1, Y4, Y5
+ VXORPD Y3, Y5, Y3
+
+ // Store 2 outputs
+ VMOVDQU Y2, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y3, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x2Xor_end:
+ RET
+
+// func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_1x2_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x2_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), BX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+ ADDQ DI, BX
+
+ // Add start offset to input
+ ADDQ DI, DX
+ MOVQ $0x0000000f, DI
+ MOVQ DI, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_1x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (SI), Y0
+ VMOVDQU 32(SI), Y1
+ VMOVDQU (BX), Y2
+ VMOVDQU 32(BX), Y3
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (SI)
+ VMOVDQU Y1, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y2, (BX)
+ VMOVDQU Y3, 32(BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x2_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_1x2_64Xor_end:
+ RET
+
+// func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, SSE2
+TEXT ·mulAvxTwo_1x3_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x3_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), BX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, BX
+
+ // Add start offset to input
+ ADDQ R8, DX
+ MOVQ $0x0000000f, R8
+ MOVQ R8, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_1x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y10, Y10
+ VPAND Y6, Y12, Y12
+ VMOVDQU (CX), Y4
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y11, Y4, Y5
+ VPSHUFB Y9, Y4, Y4
+ VPSHUFB Y12, Y8, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y4, Y8, Y0
+ VPXOR Y5, Y7, Y1
+ VMOVDQU 64(CX), Y4
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y11, Y4, Y5
+ VPSHUFB Y9, Y4, Y4
+ VPSHUFB Y12, Y8, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y4, Y8, Y2
+ VPXOR Y5, Y7, Y3
+ VMOVDQU 128(CX), Y4
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y11, Y4, Y5
+ VPSHUFB Y9, Y4, Y4
+ VPSHUFB Y12, Y8, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y4, Y8, Y4
+ VPXOR Y5, Y7, Y5
+
+ // Store 3 outputs
+ VMOVDQU Y0, (SI)
+ VMOVDQU Y1, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y2, (DI)
+ VMOVDQU Y3, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y4, (BX)
+ VMOVDQU Y5, 32(BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x3_64_loop
+ VZEROUPPER
+
+mulAvxTwo_1x3_64_end:
+ RET
+
+// func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, BX
+ ADDQ DI, SI
+ ADDQ DI, DX
+
+ // Add start offset to input
+ ADDQ DI, CX
+
+mulGFNI_1x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (CX), Z5
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z5, Z3
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z4
+ VGF2P8AFFINEQB $0x00, Z2, Z5, Z5
+
+ // Store 3 outputs
+ VMOVDQU64 Z3, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z4, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z5, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x3_64_loop
+ VZEROUPPER
+
+mulGFNI_1x3_64_end:
+ RET
+
+// func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x3(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, BX
+ ADDQ DI, SI
+ ADDQ DI, DX
+
+ // Add start offset to input
+ ADDQ DI, CX
+
+mulAvxGFNI_1x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (CX), Y5
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y5, Y3
+ VGF2P8AFFINEQB $0x00, Y1, Y5, Y4
+ VGF2P8AFFINEQB $0x00, Y2, Y5, Y5
+
+ // Store 3 outputs
+ VMOVDQU Y3, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y4, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y5, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x3_end:
+ RET
+
+// func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x3_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, BX
+ ADDQ DI, SI
+ ADDQ DI, DX
+
+ // Add start offset to input
+ ADDQ DI, CX
+
+mulGFNI_1x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (BX), Z3
+ VMOVDQU64 (SI), Z4
+ VMOVDQU64 (DX), Z5
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (CX), Z6
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z6, Z7
+ VXORPD Z3, Z7, Z3
+ VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
+ VXORPD Z4, Z7, Z4
+ VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
+ VXORPD Z5, Z7, Z5
+
+ // Store 3 outputs
+ VMOVDQU64 Z3, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z4, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z5, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x3Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, BX
+ ADDQ DI, SI
+ ADDQ DI, DX
+
+ // Add start offset to input
+ ADDQ DI, CX
+
+mulAvxGFNI_1x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (BX), Y3
+ VMOVDQU (SI), Y4
+ VMOVDQU (DX), Y5
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (CX), Y6
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y6, Y7
+ VXORPD Y3, Y7, Y3
+ VGF2P8AFFINEQB $0x00, Y1, Y6, Y7
+ VXORPD Y4, Y7, Y4
+ VGF2P8AFFINEQB $0x00, Y2, Y6, Y7
+ VXORPD Y5, Y7, Y5
+
+ // Store 3 outputs
+ VMOVDQU Y3, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y4, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y5, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x3Xor_end:
+ RET
+
+// func mulAvxTwo_1x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_1x3_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x3_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), BX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, BX
+
+ // Add start offset to input
+ ADDQ R8, DX
+ MOVQ $0x0000000f, R8
+ MOVQ R8, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_1x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (SI), Y0
+ VMOVDQU 32(SI), Y1
+ VMOVDQU (DI), Y2
+ VMOVDQU 32(DI), Y3
+ VMOVDQU (BX), Y4
+ VMOVDQU 32(BX), Y5
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (SI)
+ VMOVDQU Y1, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y2, (DI)
+ VMOVDQU Y3, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y4, (BX)
+ VMOVDQU Y5, 32(BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x3_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_1x3_64Xor_end:
+ RET
+
+// func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, SSE2
+TEXT ·mulAvxTwo_1x4(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x4_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), BX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, BX
+
+ // Add start offset to input
+ ADDQ R9, DX
+ MOVQ $0x0000000f, R9
+ MOVQ R9, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_1x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (DX), Y6
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y6, Y7
+ VPAND Y4, Y6, Y6
+ VPAND Y4, Y7, Y7
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y7, Y5, Y5
+ VPXOR Y3, Y5, Y0
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y7, Y5, Y5
+ VPXOR Y3, Y5, Y1
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y7, Y5, Y5
+ VPXOR Y3, Y5, Y2
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y7, Y5, Y5
+ VPXOR Y3, Y5, Y3
+
+ // Store 4 outputs
+ VMOVDQU Y0, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y1, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y2, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y3, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x4_loop
+ VZEROUPPER
+
+mulAvxTwo_1x4_end:
+ RET
+
+// func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x4_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, DX
+
+ // Add start offset to input
+ ADDQ R8, CX
+
+mulGFNI_1x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (CX), Z7
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z7, Z4
+ VGF2P8AFFINEQB $0x00, Z1, Z7, Z5
+ VGF2P8AFFINEQB $0x00, Z2, Z7, Z6
+ VGF2P8AFFINEQB $0x00, Z3, Z7, Z7
+
+ // Store 4 outputs
+ VMOVDQU64 Z4, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z5, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z6, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z7, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x4_64_loop
+ VZEROUPPER
+
+mulGFNI_1x4_64_end:
+ RET
+
+// func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x4(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, DX
+
+ // Add start offset to input
+ ADDQ R8, CX
+
+mulAvxGFNI_1x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (CX), Y7
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y7, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y7, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y7, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y7, Y7
+
+ // Store 4 outputs
+ VMOVDQU Y4, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y5, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y7, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x4_end:
+ RET
+
+// func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x4_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, DX
+
+ // Add start offset to input
+ ADDQ R8, CX
+
+mulGFNI_1x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (BX), Z4
+ VMOVDQU64 (SI), Z5
+ VMOVDQU64 (DI), Z6
+ VMOVDQU64 (DX), Z7
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (CX), Z8
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z8, Z9
+ VXORPD Z4, Z9, Z4
+ VGF2P8AFFINEQB $0x00, Z1, Z8, Z9
+ VXORPD Z5, Z9, Z5
+ VGF2P8AFFINEQB $0x00, Z2, Z8, Z9
+ VXORPD Z6, Z9, Z6
+ VGF2P8AFFINEQB $0x00, Z3, Z8, Z9
+ VXORPD Z7, Z9, Z7
+
+ // Store 4 outputs
+ VMOVDQU64 Z4, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z5, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z6, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z7, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x4Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, DX
+
+ // Add start offset to input
+ ADDQ R8, CX
+
+mulAvxGFNI_1x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (BX), Y4
+ VMOVDQU (SI), Y5
+ VMOVDQU (DI), Y6
+ VMOVDQU (DX), Y7
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (CX), Y8
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y8, Y9
+ VXORPD Y4, Y9, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y8, Y9
+ VXORPD Y5, Y9, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y8, Y9
+ VXORPD Y6, Y9, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y8, Y9
+ VXORPD Y7, Y9, Y7
+
+ // Store 4 outputs
+ VMOVDQU Y4, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y5, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y7, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x4Xor_end:
+ RET
+
+// func mulAvxTwo_1x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_1x4Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x4Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), BX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, BX
+
+ // Add start offset to input
+ ADDQ R9, DX
+ MOVQ $0x0000000f, R9
+ MOVQ R9, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_1x4Xor_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (SI), Y0
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU (DI), Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU (R8), Y2
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU (BX), Y3
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ VMOVDQU Y0, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y1, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y2, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y3, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x4Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_1x4Xor_end:
+ RET
+
+// func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, SSE2
+TEXT ·mulAvxTwo_1x5(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x5_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), BX
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, BX
+
+ // Add start offset to input
+ ADDQ R10, DX
+ MOVQ $0x0000000f, R10
+ MOVQ R10, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_1x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y5, Y7, Y7
+ VPAND Y5, Y8, Y8
+ VMOVDQU (CX), Y4
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y4, Y4
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y4, Y6, Y0
+ VMOVDQU 64(CX), Y4
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y4, Y4
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y4, Y6, Y1
+ VMOVDQU 128(CX), Y4
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y4, Y4
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y4, Y6, Y2
+ VMOVDQU 192(CX), Y4
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y4, Y4
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y4, Y6, Y3
+ VMOVDQU 256(CX), Y4
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y4, Y4
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y4, Y6, Y4
+
+ // Store 5 outputs
+ VMOVDQU Y0, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y1, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y2, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y3, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y4, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x5_loop
+ VZEROUPPER
+
+mulAvxTwo_1x5_end:
+ RET
+
+// func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x5_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, DX
+
+ // Add start offset to input
+ ADDQ R9, CX
+
+mulGFNI_1x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (CX), Z9
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z9, Z5
+ VGF2P8AFFINEQB $0x00, Z1, Z9, Z6
+ VGF2P8AFFINEQB $0x00, Z2, Z9, Z7
+ VGF2P8AFFINEQB $0x00, Z3, Z9, Z8
+ VGF2P8AFFINEQB $0x00, Z4, Z9, Z9
+
+ // Store 5 outputs
+ VMOVDQU64 Z5, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z6, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z7, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z8, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z9, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x5_64_loop
+ VZEROUPPER
+
+mulGFNI_1x5_64_end:
+ RET
+
+// func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x5(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, DX
+
+ // Add start offset to input
+ ADDQ R9, CX
+
+mulAvxGFNI_1x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (CX), Y9
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y9, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y9, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y9, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y9, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y9, Y9
+
+ // Store 5 outputs
+ VMOVDQU Y5, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y6, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x5_end:
+ RET
+
+// func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x5_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, DX
+
+ // Add start offset to input
+ ADDQ R9, CX
+
+mulGFNI_1x5_64Xor_loop:
+ // Load 5 outputs
+ VMOVDQU64 (BX), Z5
+ VMOVDQU64 (SI), Z6
+ VMOVDQU64 (DI), Z7
+ VMOVDQU64 (R8), Z8
+ VMOVDQU64 (DX), Z9
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (CX), Z10
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z10, Z11
+ VXORPD Z5, Z11, Z5
+ VGF2P8AFFINEQB $0x00, Z1, Z10, Z11
+ VXORPD Z6, Z11, Z6
+ VGF2P8AFFINEQB $0x00, Z2, Z10, Z11
+ VXORPD Z7, Z11, Z7
+ VGF2P8AFFINEQB $0x00, Z3, Z10, Z11
+ VXORPD Z8, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z4, Z10, Z11
+ VXORPD Z9, Z11, Z9
+
+ // Store 5 outputs
+ VMOVDQU64 Z5, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z6, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z7, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z8, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z9, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x5Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, DX
+
+ // Add start offset to input
+ ADDQ R9, CX
+
+mulAvxGFNI_1x5Xor_loop:
+ // Load 5 outputs
+ VMOVDQU (BX), Y5
+ VMOVDQU (SI), Y6
+ VMOVDQU (DI), Y7
+ VMOVDQU (R8), Y8
+ VMOVDQU (DX), Y9
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (CX), Y10
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y10, Y11
+ VXORPD Y5, Y11, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y10, Y11
+ VXORPD Y6, Y11, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y10, Y11
+ VXORPD Y7, Y11, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y10, Y11
+ VXORPD Y8, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y10, Y11
+ VXORPD Y9, Y11, Y9
+
+ // Store 5 outputs
+ VMOVDQU Y5, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y6, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x5Xor_end:
+ RET
+
+// func mulAvxTwo_1x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_1x5Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x5Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), BX
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, BX
+
+ // Add start offset to input
+ ADDQ R10, DX
+ MOVQ $0x0000000f, R10
+ MOVQ R10, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_1x5Xor_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU (SI), Y0
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU (DI), Y1
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU (R8), Y2
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU (R9), Y3
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU (BX), Y4
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ VMOVDQU Y0, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y1, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y2, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y3, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y4, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x5Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_1x5Xor_end:
+ RET
+
+// func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, SSE2
+TEXT ·mulAvxTwo_1x6(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x6_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), BX
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, BX
+
+ // Add start offset to input
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R11
+ MOVQ R11, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_1x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y6, Y8, Y8
+ VPAND Y6, Y9, Y9
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y5, Y5
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y5, Y7, Y0
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y5, Y5
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y5, Y7, Y1
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y5, Y5
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y5, Y7, Y2
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y5, Y5
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y5, Y7, Y3
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y5, Y5
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y5, Y7, Y4
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y5, Y5
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y5, Y7, Y5
+
+ // Store 6 outputs
+ VMOVDQU Y0, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y1, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y2, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y3, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y4, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y5, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x6_loop
+ VZEROUPPER
+
+mulAvxTwo_1x6_end:
+ RET
+
+// func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x6_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, DX
+
+ // Add start offset to input
+ ADDQ R10, CX
+
+mulGFNI_1x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (CX), Z11
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z11, Z6
+ VGF2P8AFFINEQB $0x00, Z1, Z11, Z7
+ VGF2P8AFFINEQB $0x00, Z2, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z3, Z11, Z9
+ VGF2P8AFFINEQB $0x00, Z4, Z11, Z10
+ VGF2P8AFFINEQB $0x00, Z5, Z11, Z11
+
+ // Store 6 outputs
+ VMOVDQU64 Z6, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z7, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z8, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z9, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z10, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z11, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x6_64_loop
+ VZEROUPPER
+
+mulGFNI_1x6_64_end:
+ RET
+
+// func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x6(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, DX
+
+ // Add start offset to input
+ ADDQ R10, CX
+
+mulAvxGFNI_1x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (CX), Y11
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y11, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y11, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y11, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y11, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y11, Y11
+
+ // Store 6 outputs
+ VMOVDQU Y6, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y7, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y8, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y9, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x6_end:
+ RET
+
+// func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x6_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, DX
+
+ // Add start offset to input
+ ADDQ R10, CX
+
+mulGFNI_1x6_64Xor_loop:
+ // Load 6 outputs
+ VMOVDQU64 (BX), Z6
+ VMOVDQU64 (SI), Z7
+ VMOVDQU64 (DI), Z8
+ VMOVDQU64 (R8), Z9
+ VMOVDQU64 (R9), Z10
+ VMOVDQU64 (DX), Z11
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (CX), Z12
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z12, Z13
+ VXORPD Z6, Z13, Z6
+ VGF2P8AFFINEQB $0x00, Z1, Z12, Z13
+ VXORPD Z7, Z13, Z7
+ VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
+ VXORPD Z8, Z13, Z8
+ VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Store 6 outputs
+ VMOVDQU64 Z6, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z7, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z8, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z9, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z10, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z11, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x6Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, DX
+
+ // Add start offset to input
+ ADDQ R10, CX
+
+mulAvxGFNI_1x6Xor_loop:
+ // Load 6 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU (SI), Y7
+ VMOVDQU (DI), Y8
+ VMOVDQU (R8), Y9
+ VMOVDQU (R9), Y10
+ VMOVDQU (DX), Y11
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (CX), Y12
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y12, Y13
+ VXORPD Y6, Y13, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y12, Y13
+ VXORPD Y7, Y13, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
+ VXORPD Y8, Y13, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Store 6 outputs
+ VMOVDQU Y6, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y7, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y8, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y9, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x6Xor_end:
+ RET
+
+// func mulAvxTwo_1x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_1x6Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x6Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), BX
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, BX
+
+ // Add start offset to input
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R11
+ MOVQ R11, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_1x6Xor_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU (SI), Y0
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU (DI), Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU (R8), Y2
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU (R9), Y3
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU (R10), Y4
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU (BX), Y5
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ VMOVDQU Y0, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y1, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y2, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y3, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y4, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y5, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x6Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_1x6Xor_end:
+ RET
+
+// func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, SSE2
+TEXT ·mulAvxTwo_1x7(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x7_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), BX
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, BX
+
+ // Add start offset to input
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_1x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y7, Y9, Y9
+ VPAND Y7, Y10, Y10
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y6, Y6
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y6, Y8, Y0
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y6, Y6
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y6, Y8, Y1
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y6, Y6
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y6, Y8, Y2
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y6, Y6
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y6, Y8, Y3
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y6, Y6
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y6, Y8, Y4
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y6, Y6
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y6, Y8, Y5
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y6, Y6
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y6, Y8, Y6
+
+ // Store 7 outputs
+ VMOVDQU Y0, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y1, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y2, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y3, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y4, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y5, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y6, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x7_loop
+ VZEROUPPER
+
+mulAvxTwo_1x7_end:
+ RET
+
+// func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x7_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 16 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DX
+
+ // Add start offset to input
+ ADDQ R11, CX
+
+mulGFNI_1x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (CX), Z13
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z13, Z7
+ VGF2P8AFFINEQB $0x00, Z1, Z13, Z8
+ VGF2P8AFFINEQB $0x00, Z2, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z3, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z4, Z13, Z11
+ VGF2P8AFFINEQB $0x00, Z5, Z13, Z12
+ VGF2P8AFFINEQB $0x00, Z6, Z13, Z13
+
+ // Store 7 outputs
+ VMOVDQU64 Z7, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z8, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z9, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z10, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z11, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z12, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z13, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x7_64_loop
+ VZEROUPPER
+
+mulGFNI_1x7_64_end:
+ RET
+
+// func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x7(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 16 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DX
+
+ // Add start offset to input
+ ADDQ R11, CX
+
+mulAvxGFNI_1x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (CX), Y13
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y13, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y13, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y13, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y13, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y13, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y8, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x7_end:
+ RET
+
+// func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x7_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 16 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DX
+
+ // Add start offset to input
+ ADDQ R11, CX
+
+mulGFNI_1x7_64Xor_loop:
+ // Load 7 outputs
+ VMOVDQU64 (BX), Z7
+ VMOVDQU64 (SI), Z8
+ VMOVDQU64 (DI), Z9
+ VMOVDQU64 (R8), Z10
+ VMOVDQU64 (R9), Z11
+ VMOVDQU64 (R10), Z12
+ VMOVDQU64 (DX), Z13
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (CX), Z14
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z14, Z15
+ VXORPD Z7, Z15, Z7
+ VGF2P8AFFINEQB $0x00, Z1, Z14, Z15
+ VXORPD Z8, Z15, Z8
+ VGF2P8AFFINEQB $0x00, Z2, Z14, Z15
+ VXORPD Z9, Z15, Z9
+ VGF2P8AFFINEQB $0x00, Z3, Z14, Z15
+ VXORPD Z10, Z15, Z10
+ VGF2P8AFFINEQB $0x00, Z4, Z14, Z15
+ VXORPD Z11, Z15, Z11
+ VGF2P8AFFINEQB $0x00, Z5, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z6, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Store 7 outputs
+ VMOVDQU64 Z7, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z8, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z9, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z10, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z11, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z12, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z13, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x7Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 16 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DX
+
+ // Add start offset to input
+ ADDQ R11, CX
+
+mulAvxGFNI_1x7Xor_loop:
+ // Load 7 outputs
+ VMOVDQU (BX), Y7
+ VMOVDQU (SI), Y8
+ VMOVDQU (DI), Y9
+ VMOVDQU (R8), Y10
+ VMOVDQU (R9), Y11
+ VMOVDQU (R10), Y12
+ VMOVDQU (DX), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (CX), Y14
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y8, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x7Xor_end:
+ RET
+
+// func mulAvxTwo_1x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_1x7Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x7Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), BX
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, BX
+
+ // Add start offset to input
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_1x7Xor_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU (SI), Y0
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU (DI), Y1
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU (R8), Y2
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU (R9), Y3
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU (R10), Y4
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU (R11), Y5
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU (BX), Y6
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ VMOVDQU Y0, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y1, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y2, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y3, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y4, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y5, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y6, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x7Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_1x7Xor_end:
+ RET
+
+// func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, SSE2
+TEXT ·mulAvxTwo_1x8(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x8_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), BX
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, BX
+
+ // Add start offset to input
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_1x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y8, Y10, Y10
+ VPAND Y8, Y11, Y11
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y7, Y7
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y7, Y9, Y0
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y7, Y7
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y7, Y9, Y1
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y7, Y7
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y7, Y9, Y2
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y7, Y7
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y7, Y9, Y3
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y7, Y7
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y7, Y9, Y4
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y7, Y7
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y7, Y9, Y5
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y7, Y7
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y7, Y9, Y6
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y7, Y7
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y7, Y9, Y7
+
+ // Store 8 outputs
+ VMOVDQU Y0, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y1, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y2, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y3, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y4, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y5, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y6, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y7, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x8_loop
+ VZEROUPPER
+
+mulAvxTwo_1x8_end:
+ RET
+
+// func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x8_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, DX
+
+ // Add start offset to input
+ ADDQ R12, CX
+
+mulGFNI_1x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (CX), Z15
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z15, Z8
+ VGF2P8AFFINEQB $0x00, Z1, Z15, Z9
+ VGF2P8AFFINEQB $0x00, Z2, Z15, Z10
+ VGF2P8AFFINEQB $0x00, Z3, Z15, Z11
+ VGF2P8AFFINEQB $0x00, Z4, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z5, Z15, Z13
+ VGF2P8AFFINEQB $0x00, Z6, Z15, Z14
+ VGF2P8AFFINEQB $0x00, Z7, Z15, Z15
+
+ // Store 8 outputs
+ VMOVDQU64 Z8, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z9, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z10, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z11, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z12, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z13, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z14, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z15, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x8_64_loop
+ VZEROUPPER
+
+mulGFNI_1x8_64_end:
+ RET
+
+// func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x8(SB), $0-88
+ // Loading 6 of 8 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), BX
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, BX
+
+ // Add start offset to input
+ ADDQ R13, DX
+
+mulAvxGFNI_1x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y13, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y13, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y13, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y13, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y13, Y12
+ VBROADCASTSD 56(CX), Y14
+ VGF2P8AFFINEQB $0x00, Y14, Y13, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x8_end:
+ RET
+
+// func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x8_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, DX
+
+ // Add start offset to input
+ ADDQ R12, CX
+
+mulGFNI_1x8_64Xor_loop:
+ // Load 8 outputs
+ VMOVDQU64 (BX), Z8
+ VMOVDQU64 (SI), Z9
+ VMOVDQU64 (DI), Z10
+ VMOVDQU64 (R8), Z11
+ VMOVDQU64 (R9), Z12
+ VMOVDQU64 (R10), Z13
+ VMOVDQU64 (R11), Z14
+ VMOVDQU64 (DX), Z15
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (CX), Z16
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z16, Z17
+ VXORPD Z8, Z17, Z8
+ VGF2P8AFFINEQB $0x00, Z1, Z16, Z17
+ VXORPD Z9, Z17, Z9
+ VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
+ VXORPD Z10, Z17, Z10
+ VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
+ VXORPD Z11, Z17, Z11
+ VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
+ VXORPD Z12, Z17, Z12
+ VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
+ VXORPD Z13, Z17, Z13
+ VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Store 8 outputs
+ VMOVDQU64 Z8, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z9, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z10, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z11, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z12, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z13, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z14, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z15, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x8Xor(SB), $0-88
+ // Loading 6 of 8 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), BX
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, BX
+
+ // Add start offset to input
+ ADDQ R13, DX
+
+mulAvxGFNI_1x8Xor_loop:
+ // Load 8 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU (DI), Y7
+ VMOVDQU (R8), Y8
+ VMOVDQU (R9), Y9
+ VMOVDQU (R10), Y10
+ VMOVDQU (R11), Y11
+ VMOVDQU (R12), Y12
+ VMOVDQU (BX), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x8Xor_end:
+ RET
+
+// func mulAvxTwo_1x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_1x8Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x8Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), BX
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, BX
+
+ // Add start offset to input
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_1x8Xor_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU (SI), Y0
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU (DI), Y1
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU (R8), Y2
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU (R9), Y3
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU (R10), Y4
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU (R11), Y5
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU (R12), Y6
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU (BX), Y7
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ VMOVDQU Y0, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y1, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y2, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y3, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y4, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y5, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y6, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y7, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x8Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_1x8Xor_end:
+ RET
+
+// func mulAvxTwo_1x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, SSE2
+TEXT ·mulAvxTwo_1x9(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x9_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), BX
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, BX
+
+ // Add start offset to input
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_1x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y9, Y11, Y11
+ VPAND Y9, Y12, Y12
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y8, Y8
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y8, Y10, Y0
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y8, Y8
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y8, Y10, Y1
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y8, Y8
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y8, Y10, Y2
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y8, Y8
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y8, Y10, Y3
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y8, Y8
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y8, Y10, Y4
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y8, Y8
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y8, Y10, Y5
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y8, Y8
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y8, Y10, Y6
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y8, Y8
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y8, Y10, Y7
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y8, Y8
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y8, Y10, Y8
+
+ // Store 9 outputs
+ VMOVDQU Y0, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y1, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y2, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y3, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y4, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y5, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y6, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y7, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y8, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x9_loop
+ VZEROUPPER
+
+mulAvxTwo_1x9_end:
+ RET
+
+// func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x9_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, DX
+
+ // Add start offset to input
+ ADDQ R13, CX
+
+mulGFNI_1x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (CX), Z17
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z17, Z9
+ VGF2P8AFFINEQB $0x00, Z1, Z17, Z10
+ VGF2P8AFFINEQB $0x00, Z2, Z17, Z11
+ VGF2P8AFFINEQB $0x00, Z3, Z17, Z12
+ VGF2P8AFFINEQB $0x00, Z4, Z17, Z13
+ VGF2P8AFFINEQB $0x00, Z5, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z6, Z17, Z15
+ VGF2P8AFFINEQB $0x00, Z7, Z17, Z16
+ VGF2P8AFFINEQB $0x00, Z8, Z17, Z17
+
+ // Store 9 outputs
+ VMOVDQU64 Z9, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z10, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z11, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z12, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z13, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z14, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z15, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z16, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z17, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x9_64_loop
+ VZEROUPPER
+
+mulGFNI_1x9_64_end:
+ RET
+
+// func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x9(SB), $0-88
+ // Loading 5 of 9 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), BX
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, BX
+
+ // Add start offset to input
+ ADDQ R14, DX
+
+mulAvxGFNI_1x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y13, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y13, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y13, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y13, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y13, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y13, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y13, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y13, Y12
+ VBROADCASTSD 64(CX), Y14
+ VGF2P8AFFINEQB $0x00, Y14, Y13, Y13
+
+ // Store 9 outputs
+ VMOVDQU Y5, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y7, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y8, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x9_end:
+ RET
+
+// func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x9_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, DX
+
+ // Add start offset to input
+ ADDQ R13, CX
+
+mulGFNI_1x9_64Xor_loop:
+ // Load 9 outputs
+ VMOVDQU64 (BX), Z9
+ VMOVDQU64 (SI), Z10
+ VMOVDQU64 (DI), Z11
+ VMOVDQU64 (R8), Z12
+ VMOVDQU64 (R9), Z13
+ VMOVDQU64 (R10), Z14
+ VMOVDQU64 (R11), Z15
+ VMOVDQU64 (R12), Z16
+ VMOVDQU64 (DX), Z17
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (CX), Z18
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z18, Z19
+ VXORPD Z9, Z19, Z9
+ VGF2P8AFFINEQB $0x00, Z1, Z18, Z19
+ VXORPD Z10, Z19, Z10
+ VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
+ VXORPD Z11, Z19, Z11
+ VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
+ VXORPD Z12, Z19, Z12
+ VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
+ VXORPD Z13, Z19, Z13
+ VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
+ VXORPD Z14, Z19, Z14
+ VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Store 9 outputs
+ VMOVDQU64 Z9, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z10, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z11, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z12, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z13, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z14, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z15, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z16, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z17, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x9Xor(SB), $0-88
+ // Loading 5 of 9 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), BX
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, BX
+
+ // Add start offset to input
+ ADDQ R14, DX
+
+mulAvxGFNI_1x9Xor_loop:
+ // Load 9 outputs
+ VMOVDQU (SI), Y5
+ VMOVDQU (DI), Y6
+ VMOVDQU (R8), Y7
+ VMOVDQU (R9), Y8
+ VMOVDQU (R10), Y9
+ VMOVDQU (R11), Y10
+ VMOVDQU (R12), Y11
+ VMOVDQU (R13), Y12
+ VMOVDQU (BX), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ VMOVDQU Y5, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y7, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y8, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x9Xor_end:
+ RET
+
+// func mulAvxTwo_1x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_1x9Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x9Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), BX
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, BX
+
+ // Add start offset to input
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_1x9Xor_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU (SI), Y0
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU (DI), Y1
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU (R8), Y2
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU (R9), Y3
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU (R10), Y4
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU (R11), Y5
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU (R12), Y6
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU (R13), Y7
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU (BX), Y8
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ VMOVDQU Y0, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y1, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y2, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y3, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y4, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y5, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y6, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y7, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y8, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x9Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_1x9Xor_end:
+ RET
+
+// func mulAvxTwo_1x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, SSE2
+TEXT ·mulAvxTwo_1x10(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 35 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x10_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), R14
+ MOVQ 216(BX), BX
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, BX
+
+ // Add start offset to input
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_1x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y10, Y12, Y12
+ VPAND Y10, Y13, Y13
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y9, Y9
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y9, Y11, Y0
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y9, Y9
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y9, Y11, Y1
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y9, Y9
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y9, Y11, Y2
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y9, Y9
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y9, Y11, Y3
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y9, Y9
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y9, Y11, Y4
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y9, Y9
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y9, Y11, Y5
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y9, Y9
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y9, Y11, Y6
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y9, Y9
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y9, Y11, Y7
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y9, Y9
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y9, Y11, Y8
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y9, Y9
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y9, Y11, Y9
+
+ // Store 10 outputs
+ VMOVDQU Y0, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y1, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y2, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y3, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y4, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y5, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y6, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y7, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y8, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y9, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x10_loop
+ VZEROUPPER
+
+mulAvxTwo_1x10_end:
+ RET
+
+// func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x10_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, DX
+
+ // Add start offset to input
+ ADDQ R14, CX
+
+mulGFNI_1x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (CX), Z19
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z19, Z10
+ VGF2P8AFFINEQB $0x00, Z1, Z19, Z11
+ VGF2P8AFFINEQB $0x00, Z2, Z19, Z12
+ VGF2P8AFFINEQB $0x00, Z3, Z19, Z13
+ VGF2P8AFFINEQB $0x00, Z4, Z19, Z14
+ VGF2P8AFFINEQB $0x00, Z5, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z6, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z7, Z19, Z17
+ VGF2P8AFFINEQB $0x00, Z8, Z19, Z18
+ VGF2P8AFFINEQB $0x00, Z9, Z19, Z19
+
+ // Store 10 outputs
+ VMOVDQU64 Z10, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z11, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z12, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z13, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z14, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z15, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z16, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z17, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z18, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z19, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x10_64_loop
+ VZEROUPPER
+
+mulGFNI_1x10_64_end:
+ RET
+
+// func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x10(SB), $0-88
+ // Loading 4 of 10 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), R14
+ MOVQ 216(BX), BX
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, BX
+
+ // Add start offset to input
+ ADDQ R15, DX
+
+mulAvxGFNI_1x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y13, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y13, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y13, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y13, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y13, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y13, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y13, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y13, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y13, Y12
+ VBROADCASTSD 72(CX), Y14
+ VGF2P8AFFINEQB $0x00, Y14, Y13, Y13
+
+ // Store 10 outputs
+ VMOVDQU Y4, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y5, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y6, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y7, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x10_end:
+ RET
+
+// func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x10_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, DX
+
+ // Add start offset to input
+ ADDQ R14, CX
+
+mulGFNI_1x10_64Xor_loop:
+ // Load 10 outputs
+ VMOVDQU64 (BX), Z10
+ VMOVDQU64 (SI), Z11
+ VMOVDQU64 (DI), Z12
+ VMOVDQU64 (R8), Z13
+ VMOVDQU64 (R9), Z14
+ VMOVDQU64 (R10), Z15
+ VMOVDQU64 (R11), Z16
+ VMOVDQU64 (R12), Z17
+ VMOVDQU64 (R13), Z18
+ VMOVDQU64 (DX), Z19
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (CX), Z20
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z20, Z21
+ VXORPD Z10, Z21, Z10
+ VGF2P8AFFINEQB $0x00, Z1, Z20, Z21
+ VXORPD Z11, Z21, Z11
+ VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
+ VXORPD Z12, Z21, Z12
+ VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
+ VXORPD Z13, Z21, Z13
+ VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
+ VXORPD Z14, Z21, Z14
+ VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
+ VXORPD Z15, Z21, Z15
+ VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Store 10 outputs
+ VMOVDQU64 Z10, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z11, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z12, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z13, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z14, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z15, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z16, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z17, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z18, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z19, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x10Xor(SB), $0-88
+ // Loading 4 of 10 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), R14
+ MOVQ 216(BX), BX
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, BX
+
+ // Add start offset to input
+ ADDQ R15, DX
+
+mulAvxGFNI_1x10Xor_loop:
+ // Load 10 outputs
+ VMOVDQU (SI), Y4
+ VMOVDQU (DI), Y5
+ VMOVDQU (R8), Y6
+ VMOVDQU (R9), Y7
+ VMOVDQU (R10), Y8
+ VMOVDQU (R11), Y9
+ VMOVDQU (R12), Y10
+ VMOVDQU (R13), Y11
+ VMOVDQU (R14), Y12
+ VMOVDQU (BX), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ VMOVDQU Y4, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y5, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y6, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y7, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x10Xor_end:
+ RET
+
+// func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_1x10Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 35 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_1x10Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), R14
+ MOVQ 216(BX), BX
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, BX
+
+ // Add start offset to input
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_1x10Xor_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU (SI), Y0
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU (DI), Y1
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU (R8), Y2
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU (R9), Y3
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU (R10), Y4
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU (R11), Y5
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU (R12), Y6
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU (R13), Y7
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU (R14), Y8
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU (BX), Y9
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ VMOVDQU Y0, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y1, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y2, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y3, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y4, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y5, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y6, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y7, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y8, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y9, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_1x10Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_1x10Xor_end:
+ RET
+
+// func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x1_64_end
+ VMOVDQU (CX), Y0
+ VMOVDQU 32(CX), Y1
+ VMOVDQU 64(CX), Y2
+ VMOVDQU 96(CX), Y3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), BX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+
+ // Add start offset to input
+ ADDQ SI, DX
+ ADDQ SI, CX
+ MOVQ $0x0000000f, SI
+ MOVQ SI, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_2x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y7
+ VMOVDQU 32(DX), Y9
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y7, Y7
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y8, Y8
+ VPAND Y6, Y10, Y10
+ VPSHUFB Y7, Y0, Y7
+ VPSHUFB Y9, Y0, Y9
+ VPSHUFB Y8, Y1, Y8
+ VPSHUFB Y10, Y1, Y10
+ VPXOR Y7, Y8, Y4
+ VPXOR Y9, Y10, Y5
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y9
+ ADDQ $0x40, CX
+ VPSRLQ $0x04, Y7, Y8
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y7, Y7
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y8, Y8
+ VPAND Y6, Y10, Y10
+ VPSHUFB Y7, Y2, Y7
+ VPSHUFB Y9, Y2, Y9
+ VPSHUFB Y8, Y3, Y8
+ VPSHUFB Y10, Y3, Y10
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 1 outputs
+ VMOVDQU Y4, (BX)
+ VMOVDQU Y5, 32(BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x1_64_loop
+ VZEROUPPER
+
+mulAvxTwo_2x1_64_end:
+ RET
+
+// func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 5 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), BX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+
+ // Add start offset to input
+ ADDQ SI, DX
+ ADDQ SI, CX
+
+mulGFNI_2x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z3
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z2
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (CX), Z3
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z1, Z3, Z3
+ VXORPD Z2, Z3, Z2
+
+ // Store 1 outputs
+ VMOVDQU64 Z2, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x1_64_loop
+ VZEROUPPER
+
+mulGFNI_2x1_64_end:
+ RET
+
+// func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 5 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), BX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+
+ // Add start offset to input
+ ADDQ SI, DX
+ ADDQ SI, CX
+
+mulAvxGFNI_2x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y3
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y3, Y2
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (CX), Y3
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y1, Y3, Y3
+ VXORPD Y2, Y3, Y2
+
+ // Store 1 outputs
+ VMOVDQU Y2, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x1_end:
+ RET
+
+// func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 5 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), BX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+
+ // Add start offset to input
+ ADDQ SI, DX
+ ADDQ SI, CX
+
+mulGFNI_2x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (BX), Z2
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z3
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z3
+ VXORPD Z2, Z3, Z2
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (CX), Z3
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z1, Z3, Z3
+ VXORPD Z2, Z3, Z2
+
+ // Store 1 outputs
+ VMOVDQU64 Z2, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 5 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), BX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+
+ // Add start offset to input
+ ADDQ SI, DX
+ ADDQ SI, CX
+
+mulAvxGFNI_2x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (BX), Y2
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y3
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y3, Y3
+ VXORPD Y2, Y3, Y2
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (CX), Y3
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y1, Y3, Y3
+ VXORPD Y2, Y3, Y2
+
+ // Store 1 outputs
+ VMOVDQU Y2, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x1Xor_end:
+ RET
+
+// func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x1_64Xor_end
+ VMOVDQU (CX), Y0
+ VMOVDQU 32(CX), Y1
+ VMOVDQU 64(CX), Y2
+ VMOVDQU 96(CX), Y3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), BX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+
+ // Add start offset to input
+ ADDQ SI, DX
+ ADDQ SI, CX
+ MOVQ $0x0000000f, SI
+ MOVQ SI, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_2x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (BX), Y4
+ VMOVDQU 32(BX), Y5
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y7
+ VMOVDQU 32(DX), Y9
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y7, Y7
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y8, Y8
+ VPAND Y6, Y10, Y10
+ VPSHUFB Y7, Y0, Y7
+ VPSHUFB Y9, Y0, Y9
+ VPSHUFB Y8, Y1, Y8
+ VPSHUFB Y10, Y1, Y10
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y9
+ ADDQ $0x40, CX
+ VPSRLQ $0x04, Y7, Y8
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y7, Y7
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y8, Y8
+ VPAND Y6, Y10, Y10
+ VPSHUFB Y7, Y2, Y7
+ VPSHUFB Y9, Y2, Y9
+ VPSHUFB Y8, Y3, Y8
+ VPSHUFB Y10, Y3, Y10
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 1 outputs
+ VMOVDQU Y4, (BX)
+ VMOVDQU Y5, 32(BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x1_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_2x1_64Xor_end:
+ RET
+
+// func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x2_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 25 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x2_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), SI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+ ADDQ R8, SI
+
+ // Add start offset to input
+ ADDQ R8, BX
+ ADDQ R8, DX
+ MOVQ $0x0000000f, R8
+ MOVQ R8, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_2x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VPXOR Y7, Y8, Y3
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (DI)
+ VMOVDQU Y1, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y2, (SI)
+ VMOVDQU Y3, 32(SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x2_64_loop
+ VZEROUPPER
+
+mulAvxTwo_2x2_64_end:
+ RET
+
+// func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), BX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+ ADDQ DI, BX
+
+ // Add start offset to input
+ ADDQ DI, DX
+ ADDQ DI, CX
+
+mulGFNI_2x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z6
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z6, Z4
+ VGF2P8AFFINEQB $0x00, Z1, Z6, Z5
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (CX), Z6
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
+ VXORPD Z4, Z7, Z4
+ VGF2P8AFFINEQB $0x00, Z3, Z6, Z7
+ VXORPD Z5, Z7, Z5
+
+ // Store 2 outputs
+ VMOVDQU64 Z4, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z5, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x2_64_loop
+ VZEROUPPER
+
+mulGFNI_2x2_64_end:
+ RET
+
+// func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x2(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), BX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+ ADDQ DI, BX
+
+ // Add start offset to input
+ ADDQ DI, DX
+ ADDQ DI, CX
+
+mulAvxGFNI_2x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y6
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y6, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y6, Y5
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (CX), Y6
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y2, Y6, Y7
+ VXORPD Y4, Y7, Y4
+ VGF2P8AFFINEQB $0x00, Y3, Y6, Y7
+ VXORPD Y5, Y7, Y5
+
+ // Store 2 outputs
+ VMOVDQU Y4, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y5, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x2_end:
+ RET
+
+// func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), BX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+ ADDQ DI, BX
+
+ // Add start offset to input
+ ADDQ DI, DX
+ ADDQ DI, CX
+
+mulGFNI_2x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (SI), Z4
+ VMOVDQU64 (BX), Z5
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z6
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z6, Z7
+ VXORPD Z4, Z7, Z4
+ VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
+ VXORPD Z5, Z7, Z5
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (CX), Z6
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
+ VXORPD Z4, Z7, Z4
+ VGF2P8AFFINEQB $0x00, Z3, Z6, Z7
+ VXORPD Z5, Z7, Z5
+
+ // Store 2 outputs
+ VMOVDQU64 Z4, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z5, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x2Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), BX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+ ADDQ DI, BX
+
+ // Add start offset to input
+ ADDQ DI, DX
+ ADDQ DI, CX
+
+mulAvxGFNI_2x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (SI), Y4
+ VMOVDQU (BX), Y5
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y6
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y6, Y7
+ VXORPD Y4, Y7, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y6, Y7
+ VXORPD Y5, Y7, Y5
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (CX), Y6
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y2, Y6, Y7
+ VXORPD Y4, Y7, Y4
+ VGF2P8AFFINEQB $0x00, Y3, Y6, Y7
+ VXORPD Y5, Y7, Y5
+
+ // Store 2 outputs
+ VMOVDQU Y4, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y5, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x2Xor_end:
+ RET
+
+// func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x2_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 25 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x2_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), SI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+ ADDQ R8, SI
+
+ // Add start offset to input
+ ADDQ R8, BX
+ ADDQ R8, DX
+ MOVQ $0x0000000f, R8
+ MOVQ R8, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_2x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (DI), Y0
+ VMOVDQU 32(DI), Y1
+ VMOVDQU (SI), Y2
+ VMOVDQU 32(SI), Y3
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (DI)
+ VMOVDQU Y1, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y2, (SI)
+ VMOVDQU Y3, 32(SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x2_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_2x2_64Xor_end:
+ RET
+
+// func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x3_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x3_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), SI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, SI
+
+ // Add start offset to input
+ ADDQ R9, BX
+ ADDQ R9, DX
+ MOVQ $0x0000000f, R9
+ MOVQ R9, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_2x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VPXOR Y9, Y10, Y5
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (DI)
+ VMOVDQU Y1, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y2, (R8)
+ VMOVDQU Y3, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y4, (SI)
+ VMOVDQU Y5, 32(SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x3_64_loop
+ VZEROUPPER
+
+mulAvxTwo_2x3_64_end:
+ RET
+
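+// The mulGFNI_*_64 kernels require AVX-512 and GFNI: each matrix coefficient
+// is an 8x8 bit-matrix packed into a uint64, broadcast into a ZMM register
+// with VBROADCASTF32X2 and applied with VGF2P8AFFINEQB, so one instruction
+// per coefficient multiplies a full 64-byte block.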
+// func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 11 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), BX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, BX
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, CX
+
+mulGFNI_2x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z9
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z9, Z6
+ VGF2P8AFFINEQB $0x00, Z1, Z9, Z7
+ VGF2P8AFFINEQB $0x00, Z2, Z9, Z8
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (CX), Z9
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z3, Z9, Z10
+ VXORPD Z6, Z10, Z6
+ VGF2P8AFFINEQB $0x00, Z4, Z9, Z10
+ VXORPD Z7, Z10, Z7
+ VGF2P8AFFINEQB $0x00, Z5, Z9, Z10
+ VXORPD Z8, Z10, Z8
+
+ // Store 3 outputs
+ VMOVDQU64 Z6, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z7, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z8, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x3_64_loop
+ VZEROUPPER
+
+mulGFNI_2x3_64_end:
+ RET
+
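+// The mulAvxGFNI_* kernels are the 256-bit siblings of the mulGFNI_*_64 ones,
+// intended for CPUs that expose GFNI with AVX but where the AVX-512 forms are
+// unavailable or not selected; the same uint64 bit-matrices are broadcast with
+// VBROADCASTSD into YMM registers and each iteration processes 32 bytes per
+// shard.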
+// func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x3(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 11 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), BX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, BX
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, CX
+
+mulAvxGFNI_2x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y9, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y9, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y9, Y8
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (CX), Y9
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y3, Y9, Y10
+ VXORPD Y6, Y10, Y6
+ VGF2P8AFFINEQB $0x00, Y4, Y9, Y10
+ VXORPD Y7, Y10, Y7
+ VGF2P8AFFINEQB $0x00, Y5, Y9, Y10
+ VXORPD Y8, Y10, Y8
+
+ // Store 3 outputs
+ VMOVDQU Y6, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y8, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x3_end:
+ RET
+
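+// The Xor kernels differ from their plain counterparts only in that they load
+// the current output blocks first and XOR the fresh products into them, which
+// lets callers accumulate contributions from further input shards on top of
+// results produced by an earlier call.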
+// func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x3_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 11 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), BX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, BX
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, CX
+
+mulGFNI_2x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (SI), Z6
+ VMOVDQU64 (DI), Z7
+ VMOVDQU64 (BX), Z8
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z9
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z9, Z10
+ VXORPD Z6, Z10, Z6
+ VGF2P8AFFINEQB $0x00, Z1, Z9, Z10
+ VXORPD Z7, Z10, Z7
+ VGF2P8AFFINEQB $0x00, Z2, Z9, Z10
+ VXORPD Z8, Z10, Z8
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (CX), Z9
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z3, Z9, Z10
+ VXORPD Z6, Z10, Z6
+ VGF2P8AFFINEQB $0x00, Z4, Z9, Z10
+ VXORPD Z7, Z10, Z7
+ VGF2P8AFFINEQB $0x00, Z5, Z9, Z10
+ VXORPD Z8, Z10, Z8
+
+ // Store 3 outputs
+ VMOVDQU64 Z6, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z7, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z8, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x3Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 11 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), BX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, BX
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, CX
+
+mulAvxGFNI_2x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU (DI), Y7
+ VMOVDQU (BX), Y8
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y9, Y10
+ VXORPD Y6, Y10, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y9, Y10
+ VXORPD Y7, Y10, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y9, Y10
+ VXORPD Y8, Y10, Y8
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (CX), Y9
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y3, Y9, Y10
+ VXORPD Y6, Y10, Y6
+ VGF2P8AFFINEQB $0x00, Y4, Y9, Y10
+ VXORPD Y7, Y10, Y7
+ VGF2P8AFFINEQB $0x00, Y5, Y9, Y10
+ VXORPD Y8, Y10, Y8
+
+ // Store 3 outputs
+ VMOVDQU Y6, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y8, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x3Xor_end:
+ RET
+
+// func mulAvxTwo_2x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x3_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x3_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), SI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, SI
+
+ // Add start offset to input
+ ADDQ R9, BX
+ ADDQ R9, DX
+ MOVQ $0x0000000f, R9
+ MOVQ R9, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_2x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (DI), Y0
+ VMOVDQU 32(DI), Y1
+ VMOVDQU (R8), Y2
+ VMOVDQU 32(R8), Y3
+ VMOVDQU (SI), Y4
+ VMOVDQU 32(SI), Y5
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (DI)
+ VMOVDQU Y1, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y2, (R8)
+ VMOVDQU Y3, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y4, (SI)
+ VMOVDQU Y5, 32(SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x3_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_2x3_64Xor_end:
+ RET
+
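+// Shapes without the "_64" suffix, such as the ones below, keep a single YMM
+// register per stream and therefore process 32 bytes per shard per iteration;
+// otherwise they follow the same table layout as the wider variants.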
+// func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x4(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 25 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x4_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), SI
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, SI
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, DX
+ MOVQ $0x0000000f, R10
+ MOVQ R10, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_2x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y1
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y3
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ VMOVDQU Y0, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y1, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y2, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y3, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x4_loop
+ VZEROUPPER
+
+mulAvxTwo_2x4_end:
+ RET
+
+// func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x4_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), BX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, BX
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, CX
+
+mulGFNI_2x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z12
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z12, Z8
+ VGF2P8AFFINEQB $0x00, Z1, Z12, Z9
+ VGF2P8AFFINEQB $0x00, Z2, Z12, Z10
+ VGF2P8AFFINEQB $0x00, Z3, Z12, Z11
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (CX), Z12
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
+ VXORPD Z8, Z13, Z8
+ VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Store 4 outputs
+ VMOVDQU64 Z8, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z9, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z10, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z11, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x4_64_loop
+ VZEROUPPER
+
+mulGFNI_2x4_64_end:
+ RET
+
+// func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x4(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), BX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, BX
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, CX
+
+mulAvxGFNI_2x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y12, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y12, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y12, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y12, Y11
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (CX), Y12
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
+ VXORPD Y8, Y13, Y8
+ VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Store 4 outputs
+ VMOVDQU Y8, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x4_end:
+ RET
+
+// func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x4_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), BX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, BX
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, CX
+
+mulGFNI_2x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (SI), Z8
+ VMOVDQU64 (DI), Z9
+ VMOVDQU64 (R8), Z10
+ VMOVDQU64 (BX), Z11
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z12
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z12, Z13
+ VXORPD Z8, Z13, Z8
+ VGF2P8AFFINEQB $0x00, Z1, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (CX), Z12
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
+ VXORPD Z8, Z13, Z8
+ VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Store 4 outputs
+ VMOVDQU64 Z8, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z9, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z10, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z11, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x4Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), BX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, BX
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, CX
+
+mulAvxGFNI_2x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (SI), Y8
+ VMOVDQU (DI), Y9
+ VMOVDQU (R8), Y10
+ VMOVDQU (BX), Y11
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y12, Y13
+ VXORPD Y8, Y13, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (CX), Y12
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
+ VXORPD Y8, Y13, Y8
+ VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Store 4 outputs
+ VMOVDQU Y8, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x4Xor_end:
+ RET
+
+// func mulAvxTwo_2x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x4Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 25 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x4Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), SI
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, SI
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, DX
+ MOVQ $0x0000000f, R10
+ MOVQ R10, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_2x4Xor_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (DI), Y0
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU (R8), Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU (R9), Y2
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU (SI), Y3
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ VMOVDQU Y0, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y1, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y2, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y3, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x4Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_2x4Xor_end:
+ RET
+
+// func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x5(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x5_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), SI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, SI
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R11
+ MOVQ R11, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_2x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y0
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y1
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y2
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y3
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y4
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ VMOVDQU Y0, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y1, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y2, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y3, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y4, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x5_loop
+ VZEROUPPER
+
+mulAvxTwo_2x5_end:
+ RET
+
+// func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x5_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), BX
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, BX
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, CX
+
+mulGFNI_2x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z15
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z15, Z10
+ VGF2P8AFFINEQB $0x00, Z1, Z15, Z11
+ VGF2P8AFFINEQB $0x00, Z2, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z3, Z15, Z13
+ VGF2P8AFFINEQB $0x00, Z4, Z15, Z14
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (CX), Z15
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z5, Z15, Z16
+ VXORPD Z10, Z16, Z10
+ VGF2P8AFFINEQB $0x00, Z6, Z15, Z16
+ VXORPD Z11, Z16, Z11
+ VGF2P8AFFINEQB $0x00, Z7, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z8, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z9, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Store 5 outputs
+ VMOVDQU64 Z10, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z11, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z12, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z13, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z14, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x5_64_loop
+ VZEROUPPER
+
+mulGFNI_2x5_64_end:
+ RET
+
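+// For the wider mulAvxGFNI shapes there are more coefficient tables than free
+// YMM registers (see "Loading 9 of 10 tables to registers" below); the tables
+// that do not fit stay in memory and are re-broadcast with VBROADCASTSD inside
+// the loop body right before they are applied.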
+// func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x5(SB), $0-88
+ // Loading 9 of 10 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), SI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, SI
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, DX
+
+mulAvxGFNI_2x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x5_end:
+ RET
+
+// func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x5_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), BX
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, BX
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, CX
+
+mulGFNI_2x5_64Xor_loop:
+ // Load 5 outputs
+ VMOVDQU64 (SI), Z10
+ VMOVDQU64 (DI), Z11
+ VMOVDQU64 (R8), Z12
+ VMOVDQU64 (R9), Z13
+ VMOVDQU64 (BX), Z14
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z15
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z15, Z16
+ VXORPD Z10, Z16, Z10
+ VGF2P8AFFINEQB $0x00, Z1, Z15, Z16
+ VXORPD Z11, Z16, Z11
+ VGF2P8AFFINEQB $0x00, Z2, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z3, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z4, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (CX), Z15
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z5, Z15, Z16
+ VXORPD Z10, Z16, Z10
+ VGF2P8AFFINEQB $0x00, Z6, Z15, Z16
+ VXORPD Z11, Z16, Z11
+ VGF2P8AFFINEQB $0x00, Z7, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z8, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z9, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Store 5 outputs
+ VMOVDQU64 Z10, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z11, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z12, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z13, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z14, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x5Xor(SB), $0-88
+ // Loading 9 of 10 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), SI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, SI
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, DX
+
+mulAvxGFNI_2x5Xor_loop:
+ // Load 5 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU (R8), Y10
+ VMOVDQU (R9), Y11
+ VMOVDQU (R10), Y12
+ VMOVDQU (SI), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x5Xor_end:
+ RET
+
+// func mulAvxTwo_2x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x5Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x5Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), SI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, SI
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R11
+ MOVQ R11, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_2x5Xor_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU (DI), Y0
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU (R8), Y1
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU (R9), Y2
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU (R10), Y3
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU (SI), Y4
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ VMOVDQU Y0, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y1, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y2, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y3, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y4, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x5Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_2x5Xor_end:
+ RET
+
+// func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x6(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 35 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x6_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), SI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, SI
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_2x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y3
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y5
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ VMOVDQU Y0, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y1, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y2, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y3, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y4, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y5, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x6_loop
+ VZEROUPPER
+
+mulAvxTwo_2x6_end:
+ RET
+
+// func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x6_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), BX
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, BX
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, CX
+
+mulGFNI_2x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (DX), Z18
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z18, Z12
+ VGF2P8AFFINEQB $0x00, Z1, Z18, Z13
+ VGF2P8AFFINEQB $0x00, Z2, Z18, Z14
+ VGF2P8AFFINEQB $0x00, Z3, Z18, Z15
+ VGF2P8AFFINEQB $0x00, Z4, Z18, Z16
+ VGF2P8AFFINEQB $0x00, Z5, Z18, Z17
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (CX), Z18
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
+ VXORPD Z12, Z19, Z12
+ VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
+ VXORPD Z13, Z19, Z13
+ VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
+ VXORPD Z14, Z19, Z14
+ VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Store 6 outputs
+ VMOVDQU64 Z12, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z13, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z14, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z15, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z16, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z17, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x6_64_loop
+ VZEROUPPER
+
+mulGFNI_2x6_64_end:
+ RET
+
+// func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x6(SB), $0-88
+ // Loading 8 of 12 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), SI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, SI
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, DX
+
+mulAvxGFNI_2x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y9, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y12, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x6_end:
+ RET
+
+// func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x6_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), BX
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, BX
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, CX
+
+mulGFNI_2x6_64Xor_loop:
+ // Load 6 outputs
+ VMOVDQU64 (SI), Z12
+ VMOVDQU64 (DI), Z13
+ VMOVDQU64 (R8), Z14
+ VMOVDQU64 (R9), Z15
+ VMOVDQU64 (R10), Z16
+ VMOVDQU64 (BX), Z17
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (DX), Z18
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z18, Z19
+ VXORPD Z12, Z19, Z12
+ VGF2P8AFFINEQB $0x00, Z1, Z18, Z19
+ VXORPD Z13, Z19, Z13
+ VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
+ VXORPD Z14, Z19, Z14
+ VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (CX), Z18
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
+ VXORPD Z12, Z19, Z12
+ VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
+ VXORPD Z13, Z19, Z13
+ VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
+ VXORPD Z14, Z19, Z14
+ VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Store 6 outputs
+ VMOVDQU64 Z12, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z13, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z14, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z15, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z16, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z17, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x6Xor(SB), $0-88
+ // Loading 8 of 12 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), SI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, SI
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, DX
+
+mulAvxGFNI_2x6Xor_loop:
+ // Load 6 outputs
+ VMOVDQU (DI), Y8
+ VMOVDQU (R8), Y9
+ VMOVDQU (R9), Y10
+ VMOVDQU (R10), Y11
+ VMOVDQU (R11), Y12
+ VMOVDQU (SI), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y9, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y12, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x6Xor_end:
+ RET
+
+// func mulAvxTwo_2x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x6Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 35 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x6Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), SI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, SI
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_2x6Xor_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU (DI), Y0
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU (R8), Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU (R9), Y2
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU (R10), Y3
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU (R11), Y4
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU (SI), Y5
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ VMOVDQU Y0, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y1, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y2, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y3, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y4, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y5, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x6Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_2x6Xor_end:
+ RET
+
+// func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x7(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 40 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x7_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), SI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, SI
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_2x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y0
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y1
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y2
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y3
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y4
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y5
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y6
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ VMOVDQU Y0, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y1, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y2, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y3, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y4, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y5, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y6, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x7_loop
+ VZEROUPPER
+
+mulAvxTwo_2x7_end:
+ RET
+
+// func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x7_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
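+ // In outline: every GF(2^8) coefficient arrives pre-expanded as an 8x8 bit
+ // matrix (8 bytes) that VBROADCASTF32X2 replicates across a ZMM register;
+ // VGF2P8AFFINEQB then multiplies 64 input bytes by that coefficient at once,
+ // and VXORPD folds the second input's contributions into the accumulators.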
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), BX
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, BX
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, CX
+
+mulGFNI_2x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (DX), Z21
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z21, Z14
+ VGF2P8AFFINEQB $0x00, Z1, Z21, Z15
+ VGF2P8AFFINEQB $0x00, Z2, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z3, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z4, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z5, Z21, Z19
+ VGF2P8AFFINEQB $0x00, Z6, Z21, Z20
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (CX), Z21
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z7, Z21, Z22
+ VXORPD Z14, Z22, Z14
+ VGF2P8AFFINEQB $0x00, Z8, Z21, Z22
+ VXORPD Z15, Z22, Z15
+ VGF2P8AFFINEQB $0x00, Z9, Z21, Z22
+ VXORPD Z16, Z22, Z16
+ VGF2P8AFFINEQB $0x00, Z10, Z21, Z22
+ VXORPD Z17, Z22, Z17
+ VGF2P8AFFINEQB $0x00, Z11, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z12, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z13, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Store 7 outputs
+ VMOVDQU64 Z14, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z15, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z16, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z17, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z18, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z19, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z20, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x7_64_loop
+ VZEROUPPER
+
+mulGFNI_2x7_64_end:
+ RET
+
+// func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x7(SB), $0-88
+ // Loading 7 of 14 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
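+ // In outline: the 32-byte-per-iteration AVX counterpart of mulGFNI_2x7_64;
+ // only input 0's seven affine matrices stay resident in Y0-Y6, while input
+ // 1's are re-broadcast from 56(CX) onward inside the loop, since the sixteen
+ // YMM registers cannot hold all fourteen tables plus outputs and scratch.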
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), SI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, SI
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, DX
+
+mulAvxGFNI_2x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x7_end:
+ RET
+
+// func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x7_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
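+ // In outline: same as mulGFNI_2x7_64 except that the seven output blocks are
+ // loaded from memory first and every product is XORed into their existing
+ // contents, so the routine accumulates into prior results instead of
+ // overwriting them.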
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), BX
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, BX
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, CX
+
+mulGFNI_2x7_64Xor_loop:
+ // Load 7 outputs
+ VMOVDQU64 (SI), Z14
+ VMOVDQU64 (DI), Z15
+ VMOVDQU64 (R8), Z16
+ VMOVDQU64 (R9), Z17
+ VMOVDQU64 (R10), Z18
+ VMOVDQU64 (R11), Z19
+ VMOVDQU64 (BX), Z20
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (DX), Z21
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z21, Z22
+ VXORPD Z14, Z22, Z14
+ VGF2P8AFFINEQB $0x00, Z1, Z21, Z22
+ VXORPD Z15, Z22, Z15
+ VGF2P8AFFINEQB $0x00, Z2, Z21, Z22
+ VXORPD Z16, Z22, Z16
+ VGF2P8AFFINEQB $0x00, Z3, Z21, Z22
+ VXORPD Z17, Z22, Z17
+ VGF2P8AFFINEQB $0x00, Z4, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z5, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z6, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (CX), Z21
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z7, Z21, Z22
+ VXORPD Z14, Z22, Z14
+ VGF2P8AFFINEQB $0x00, Z8, Z21, Z22
+ VXORPD Z15, Z22, Z15
+ VGF2P8AFFINEQB $0x00, Z9, Z21, Z22
+ VXORPD Z16, Z22, Z16
+ VGF2P8AFFINEQB $0x00, Z10, Z21, Z22
+ VXORPD Z17, Z22, Z17
+ VGF2P8AFFINEQB $0x00, Z11, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z12, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z13, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Store 7 outputs
+ VMOVDQU64 Z14, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z15, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z16, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z17, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z18, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z19, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z20, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x7Xor(SB), $0-88
+ // Loading 7 of 14 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), SI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, SI
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, DX
+
+mulAvxGFNI_2x7Xor_loop:
+ // Load 7 outputs
+ VMOVDQU (DI), Y7
+ VMOVDQU (R8), Y8
+ VMOVDQU (R9), Y9
+ VMOVDQU (R10), Y10
+ VMOVDQU (R11), Y11
+ VMOVDQU (R12), Y12
+ VMOVDQU (SI), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x7Xor_end:
+ RET
+
+// func mulAvxTwo_2x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x7Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 40 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x7Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), SI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, SI
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_2x7Xor_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU (DI), Y0
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU (R8), Y1
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU (R9), Y2
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU (R10), Y3
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU (R11), Y4
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU (R12), Y5
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU (SI), Y6
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ VMOVDQU Y0, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y1, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y2, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y3, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y4, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y5, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y6, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x7Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_2x7Xor_end:
+ RET
+
+// func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x8(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 45 YMM used
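+ // In outline: the same nibble-lookup scheme as mulAvxTwo_2x7, widened to
+ // eight destinations; the matrix carries 2*8 = 16 pairs of 32-byte tables
+ // (1024 bytes), one 64-byte pair per (input, output) combination.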
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x8_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), SI
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, SI
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_2x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y0
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y2
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y4
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y5
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y6
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y7
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ VMOVDQU Y0, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y1, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y2, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y3, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y4, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y5, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y6, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y7, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x8_loop
+ VZEROUPPER
+
+mulAvxTwo_2x8_end:
+ RET
+
+// func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x8_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), BX
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, BX
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, CX
+
+mulGFNI_2x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (DX), Z24
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z24, Z16
+ VGF2P8AFFINEQB $0x00, Z1, Z24, Z17
+ VGF2P8AFFINEQB $0x00, Z2, Z24, Z18
+ VGF2P8AFFINEQB $0x00, Z3, Z24, Z19
+ VGF2P8AFFINEQB $0x00, Z4, Z24, Z20
+ VGF2P8AFFINEQB $0x00, Z5, Z24, Z21
+ VGF2P8AFFINEQB $0x00, Z6, Z24, Z22
+ VGF2P8AFFINEQB $0x00, Z7, Z24, Z23
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (CX), Z24
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
+ VXORPD Z16, Z25, Z16
+ VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
+ VXORPD Z17, Z25, Z17
+ VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
+ VXORPD Z18, Z25, Z18
+ VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
+ VXORPD Z19, Z25, Z19
+ VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Store 8 outputs
+ VMOVDQU64 Z16, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z17, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z18, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z19, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z20, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z21, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z22, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z23, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x8_64_loop
+ VZEROUPPER
+
+mulGFNI_2x8_64_end:
+ RET
+
+// func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x8(SB), $0-88
+ // Loading 6 of 16 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), SI
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, SI
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, DX
+
+mulAvxGFNI_2x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y7, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y8, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x8_end:
+ RET
+
+// func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x8_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), BX
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, BX
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, CX
+
+mulGFNI_2x8_64Xor_loop:
+ // Load 8 outputs
+ VMOVDQU64 (SI), Z16
+ VMOVDQU64 (DI), Z17
+ VMOVDQU64 (R8), Z18
+ VMOVDQU64 (R9), Z19
+ VMOVDQU64 (R10), Z20
+ VMOVDQU64 (R11), Z21
+ VMOVDQU64 (R12), Z22
+ VMOVDQU64 (BX), Z23
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (DX), Z24
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
+ VXORPD Z16, Z25, Z16
+ VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
+ VXORPD Z17, Z25, Z17
+ VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
+ VXORPD Z18, Z25, Z18
+ VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
+ VXORPD Z19, Z25, Z19
+ VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (CX), Z24
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
+ VXORPD Z16, Z25, Z16
+ VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
+ VXORPD Z17, Z25, Z17
+ VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
+ VXORPD Z18, Z25, Z18
+ VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
+ VXORPD Z19, Z25, Z19
+ VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Store 8 outputs
+ VMOVDQU64 Z16, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z17, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z18, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z19, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z20, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z21, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z22, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z23, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x8Xor(SB), $0-88
+ // Loading 6 of 16 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), SI
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, SI
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, DX
+
+mulAvxGFNI_2x8Xor_loop:
+ // Load 8 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU (R8), Y7
+ VMOVDQU (R9), Y8
+ VMOVDQU (R10), Y9
+ VMOVDQU (R11), Y10
+ VMOVDQU (R12), Y11
+ VMOVDQU (R13), Y12
+ VMOVDQU (SI), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y7, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y8, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x8Xor_end:
+ RET
+
+// func mulAvxTwo_2x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x8Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 45 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x8Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), SI
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, SI
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_2x8Xor_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU (DI), Y0
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU (R8), Y1
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU (R9), Y2
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU (R10), Y3
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU (R11), Y4
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU (R12), Y5
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU (R13), Y6
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU (SI), Y7
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ VMOVDQU Y0, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y1, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y2, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y3, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y4, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y5, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y6, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y7, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x8Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_2x8Xor_end:
+ RET
+
+// func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x9(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x9_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), SI
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, SI
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_2x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y0
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y1
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y2
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y3
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y4
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y5
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y6
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y7
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y8
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ VMOVDQU Y0, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y1, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y2, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y3, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y4, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y5, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y6, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y7, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y8, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x9_loop
+ VZEROUPPER
+
+mulAvxTwo_2x9_end:
+ RET
+
+// func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x9_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
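+ // In outline: with nine outputs the full set of 18 affine matrices still
+ // fits in Z0-Z17, leaving Z18-Z26 for the accumulators and Z27/Z28 for the
+ // input block and scratch product, 29 of the 32 ZMM registers in total.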
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), BX
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, BX
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, CX
+
+mulGFNI_2x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (DX), Z27
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z27, Z18
+ VGF2P8AFFINEQB $0x00, Z1, Z27, Z19
+ VGF2P8AFFINEQB $0x00, Z2, Z27, Z20
+ VGF2P8AFFINEQB $0x00, Z3, Z27, Z21
+ VGF2P8AFFINEQB $0x00, Z4, Z27, Z22
+ VGF2P8AFFINEQB $0x00, Z5, Z27, Z23
+ VGF2P8AFFINEQB $0x00, Z6, Z27, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z27, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z27, Z26
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (CX), Z27
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z9, Z27, Z28
+ VXORPD Z18, Z28, Z18
+ VGF2P8AFFINEQB $0x00, Z10, Z27, Z28
+ VXORPD Z19, Z28, Z19
+ VGF2P8AFFINEQB $0x00, Z11, Z27, Z28
+ VXORPD Z20, Z28, Z20
+ VGF2P8AFFINEQB $0x00, Z12, Z27, Z28
+ VXORPD Z21, Z28, Z21
+ VGF2P8AFFINEQB $0x00, Z13, Z27, Z28
+ VXORPD Z22, Z28, Z22
+ VGF2P8AFFINEQB $0x00, Z14, Z27, Z28
+ VXORPD Z23, Z28, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Store 9 outputs
+ VMOVDQU64 Z18, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z19, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z20, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z21, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z22, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z23, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z24, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z25, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z26, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x9_64_loop
+ VZEROUPPER
+
+mulGFNI_2x9_64_end:
+ RET
+
+// func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x9(SB), $0-88
+ // Loading 5 of 18 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), SI
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, SI
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, DX
+
+mulAvxGFNI_2x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ VMOVDQU Y5, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y6, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y7, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x9_end:
+ RET
+
+// func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x9_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), BX
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, BX
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, CX
+
+mulGFNI_2x9_64Xor_loop:
+ // Load 9 outputs
+ VMOVDQU64 (SI), Z18
+ VMOVDQU64 (DI), Z19
+ VMOVDQU64 (R8), Z20
+ VMOVDQU64 (R9), Z21
+ VMOVDQU64 (R10), Z22
+ VMOVDQU64 (R11), Z23
+ VMOVDQU64 (R12), Z24
+ VMOVDQU64 (R13), Z25
+ VMOVDQU64 (BX), Z26
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (DX), Z27
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z27, Z28
+ VXORPD Z18, Z28, Z18
+ VGF2P8AFFINEQB $0x00, Z1, Z27, Z28
+ VXORPD Z19, Z28, Z19
+ VGF2P8AFFINEQB $0x00, Z2, Z27, Z28
+ VXORPD Z20, Z28, Z20
+ VGF2P8AFFINEQB $0x00, Z3, Z27, Z28
+ VXORPD Z21, Z28, Z21
+ VGF2P8AFFINEQB $0x00, Z4, Z27, Z28
+ VXORPD Z22, Z28, Z22
+ VGF2P8AFFINEQB $0x00, Z5, Z27, Z28
+ VXORPD Z23, Z28, Z23
+ VGF2P8AFFINEQB $0x00, Z6, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (CX), Z27
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z9, Z27, Z28
+ VXORPD Z18, Z28, Z18
+ VGF2P8AFFINEQB $0x00, Z10, Z27, Z28
+ VXORPD Z19, Z28, Z19
+ VGF2P8AFFINEQB $0x00, Z11, Z27, Z28
+ VXORPD Z20, Z28, Z20
+ VGF2P8AFFINEQB $0x00, Z12, Z27, Z28
+ VXORPD Z21, Z28, Z21
+ VGF2P8AFFINEQB $0x00, Z13, Z27, Z28
+ VXORPD Z22, Z28, Z22
+ VGF2P8AFFINEQB $0x00, Z14, Z27, Z28
+ VXORPD Z23, Z28, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Store 9 outputs
+ VMOVDQU64 Z18, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z19, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z20, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z21, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z22, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z23, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z24, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z25, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z26, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x9Xor(SB), $0-88
+ // Loading 5 of 18 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), SI
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, SI
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, DX
+
+mulAvxGFNI_2x9Xor_loop:
+ // Load 9 outputs
+ VMOVDQU (DI), Y5
+ VMOVDQU (R8), Y6
+ VMOVDQU (R9), Y7
+ VMOVDQU (R10), Y8
+ VMOVDQU (R11), Y9
+ VMOVDQU (R12), Y10
+ VMOVDQU (R13), Y11
+ VMOVDQU (R14), Y12
+ VMOVDQU (SI), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ VMOVDQU Y5, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y6, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y7, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x9Xor_end:
+ RET
+
+// func mulAvxTwo_2x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x9Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x9Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), SI
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, SI
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_2x9Xor_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU (DI), Y0
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU (R8), Y1
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU (R9), Y2
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU (R10), Y3
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU (R11), Y4
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU (R12), Y5
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU (R13), Y6
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU (R14), Y7
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU (SI), Y8
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ VMOVDQU Y0, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y1, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y2, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y3, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y4, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y5, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y6, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y7, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y8, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x9Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_2x9Xor_end:
+ RET
+
+// func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x10(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 55 YMM used
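+ // In outline: the ten-output variant uses up the general-purpose registers,
+ // so BP doubles as the start offset and then the nibble-mask constant; the
+ // 8-byte frame (versus $0 in the smaller kernels) presumably exists so the
+ // assembler can preserve the frame pointer while BP is clobbered.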
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x10_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), R15
+ MOVQ 216(SI), SI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, SI
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_2x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y0
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y1
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y2
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y3
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y4
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y5
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y6
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y7
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y8
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y9
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ VMOVDQU Y0, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y1, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y2, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y3, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y4, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y5, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y6, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y7, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y8, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y9, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x10_loop
+ VZEROUPPER
+
+mulAvxTwo_2x10_end:
+ RET
+
+// func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x10_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), R14
+ MOVQ 216(BX), BX
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, BX
+
+ // Add start offset to input
+ ADDQ R15, DX
+ ADDQ R15, CX
+
+mulGFNI_2x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (CX), Z30
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ VMOVDQU64 Z20, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z21, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z22, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z23, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x10_64_loop
+ VZEROUPPER
+
+mulGFNI_2x10_64_end:
+ RET
+
+// func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x10(SB), $8-88
+ // Loading 4 of 20 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), R15
+ MOVQ 216(SI), SI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, SI
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, DX
+
+mulAvxGFNI_2x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ VMOVDQU Y4, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x10_end:
+ RET
+
+// func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x10_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), R14
+ MOVQ 216(BX), BX
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, BX
+
+ // Add start offset to input
+ ADDQ R15, DX
+ ADDQ R15, CX
+
+mulGFNI_2x10_64Xor_loop:
+ // Load 10 outputs
+ VMOVDQU64 (SI), Z20
+ VMOVDQU64 (DI), Z21
+ VMOVDQU64 (R8), Z22
+ VMOVDQU64 (R9), Z23
+ VMOVDQU64 (R10), Z24
+ VMOVDQU64 (R11), Z25
+ VMOVDQU64 (R12), Z26
+ VMOVDQU64 (R13), Z27
+ VMOVDQU64 (R14), Z28
+ VMOVDQU64 (BX), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (CX), Z30
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ VMOVDQU64 Z20, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z21, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z22, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z23, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x10Xor(SB), $8-88
+ // Loading 4 of 20 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), R15
+ MOVQ 216(SI), SI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, SI
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, DX
+
+mulAvxGFNI_2x10Xor_loop:
+ // Load 10 outputs
+ VMOVDQU (DI), Y4
+ VMOVDQU (R8), Y5
+ VMOVDQU (R9), Y6
+ VMOVDQU (R10), Y7
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (SI), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ VMOVDQU Y4, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x10Xor_end:
+ RET
+
+// func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_2x10Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 55 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_2x10Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), R15
+ MOVQ 216(SI), SI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, SI
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_2x10Xor_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU (DI), Y0
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU (R8), Y1
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU (R9), Y2
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU (R10), Y3
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU (R11), Y4
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU (R12), Y5
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU (R13), Y6
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU (R14), Y7
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU (R15), Y8
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU (SI), Y9
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ VMOVDQU Y0, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y1, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y2, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y3, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y4, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y5, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y6, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y7, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y8, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y9, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_2x10Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_2x10Xor_end:
+ RET
+
+// func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x1_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x1_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), DI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+
+ // Add start offset to input
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, DX
+ MOVQ $0x0000000f, R8
+ MOVQ R8, X2
+ VPBROADCASTB X2, Y2
+
+mulAvxTwo_3x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU 32(BX), Y5
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ VPXOR Y3, Y4, Y0
+ VPXOR Y5, Y6, Y1
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU 32(SI), Y5
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU (DX), Y6
+ VMOVDQU 32(DX), Y5
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Store 1 outputs
+ VMOVDQU Y0, (DI)
+ VMOVDQU Y1, 32(DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x1_64_loop
+ VZEROUPPER
+
+mulAvxTwo_3x1_64_end:
+ RET
+
+// func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 6 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), SI
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+
+ // Add start offset to input
+ ADDQ DI, DX
+ ADDQ DI, BX
+ ADDQ DI, CX
+
+mulGFNI_3x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z4
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z3
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z4
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z4, Z4
+ VXORPD Z3, Z4, Z3
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (CX), Z4
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z2, Z4, Z4
+ VXORPD Z3, Z4, Z3
+
+ // Store 1 outputs
+ VMOVDQU64 Z3, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x1_64_loop
+ VZEROUPPER
+
+mulGFNI_3x1_64_end:
+ RET
+
+// func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 6 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), SI
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+
+ // Add start offset to input
+ ADDQ DI, DX
+ ADDQ DI, BX
+ ADDQ DI, CX
+
+mulAvxGFNI_3x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y4
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y4, Y3
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y4
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y4, Y4
+ VXORPD Y3, Y4, Y3
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (CX), Y4
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y2, Y4, Y4
+ VXORPD Y3, Y4, Y3
+
+ // Store 1 outputs
+ VMOVDQU Y3, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x1_end:
+ RET
+
+// func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 6 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), SI
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+
+ // Add start offset to input
+ ADDQ DI, DX
+ ADDQ DI, BX
+ ADDQ DI, CX
+
+mulGFNI_3x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (SI), Z3
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z4
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z4
+ VXORPD Z3, Z4, Z3
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z4
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z4, Z4
+ VXORPD Z3, Z4, Z3
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (CX), Z4
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z2, Z4, Z4
+ VXORPD Z3, Z4, Z3
+
+ // Store 1 outputs
+ VMOVDQU64 Z3, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 6 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), SI
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+
+ // Add start offset to input
+ ADDQ DI, DX
+ ADDQ DI, BX
+ ADDQ DI, CX
+
+mulAvxGFNI_3x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (SI), Y3
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y4
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y4, Y4
+ VXORPD Y3, Y4, Y3
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y4
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y4, Y4
+ VXORPD Y3, Y4, Y3
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (CX), Y4
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y2, Y4, Y4
+ VXORPD Y3, Y4, Y3
+
+ // Store 1 outputs
+ VMOVDQU Y3, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x1Xor_end:
+ RET
+
+// func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x1_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x1_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), DI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+
+ // Add start offset to input
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, DX
+ MOVQ $0x0000000f, R8
+ MOVQ R8, X2
+ VPBROADCASTB X2, Y2
+
+mulAvxTwo_3x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (DI), Y0
+ VMOVDQU 32(DI), Y1
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU 32(BX), Y5
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU 32(SI), Y5
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU (DX), Y6
+ VMOVDQU 32(DX), Y5
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Store 1 outputs
+ VMOVDQU Y0, (DI)
+ VMOVDQU Y1, 32(DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x1_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_3x1_64Xor_end:
+ RET
+
+// func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x2_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 33 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x2_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), DI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+ ADDQ R9, DI
+
+ // Add start offset to input
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DX
+ MOVQ $0x0000000f, R9
+ MOVQ R9, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_3x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VPXOR Y7, Y8, Y3
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y9
+ VMOVDQU 32(SI), Y11
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (R8)
+ VMOVDQU Y1, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y2, (DI)
+ VMOVDQU Y3, 32(DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x2_64_loop
+ VZEROUPPER
+
+mulAvxTwo_3x2_64_end:
+ RET
+
+// func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), SI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+ ADDQ R8, SI
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, BX
+ ADDQ R8, CX
+
+mulGFNI_3x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z8
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z8, Z6
+ VGF2P8AFFINEQB $0x00, Z1, Z8, Z7
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z8
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z8, Z9
+ VXORPD Z6, Z9, Z6
+ VGF2P8AFFINEQB $0x00, Z3, Z8, Z9
+ VXORPD Z7, Z9, Z7
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (CX), Z8
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z4, Z8, Z9
+ VXORPD Z6, Z9, Z6
+ VGF2P8AFFINEQB $0x00, Z5, Z8, Z9
+ VXORPD Z7, Z9, Z7
+
+ // Store 2 outputs
+ VMOVDQU64 Z6, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z7, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x2_64_loop
+ VZEROUPPER
+
+mulGFNI_3x2_64_end:
+ RET
+
+// func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x2(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), SI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+ ADDQ R8, SI
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, BX
+ ADDQ R8, CX
+
+mulAvxGFNI_3x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y8, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y8, Y7
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y2, Y8, Y9
+ VXORPD Y6, Y9, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y8, Y9
+ VXORPD Y7, Y9, Y7
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (CX), Y8
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y4, Y8, Y9
+ VXORPD Y6, Y9, Y6
+ VGF2P8AFFINEQB $0x00, Y5, Y8, Y9
+ VXORPD Y7, Y9, Y7
+
+ // Store 2 outputs
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y7, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x2_end:
+ RET
+
+// func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), SI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+ ADDQ R8, SI
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, BX
+ ADDQ R8, CX
+
+mulGFNI_3x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (DI), Z6
+ VMOVDQU64 (SI), Z7
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z8
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z8, Z9
+ VXORPD Z6, Z9, Z6
+ VGF2P8AFFINEQB $0x00, Z1, Z8, Z9
+ VXORPD Z7, Z9, Z7
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z8
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z8, Z9
+ VXORPD Z6, Z9, Z6
+ VGF2P8AFFINEQB $0x00, Z3, Z8, Z9
+ VXORPD Z7, Z9, Z7
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (CX), Z8
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z4, Z8, Z9
+ VXORPD Z6, Z9, Z6
+ VGF2P8AFFINEQB $0x00, Z5, Z8, Z9
+ VXORPD Z7, Z9, Z7
+
+ // Store 2 outputs
+ VMOVDQU64 Z6, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z7, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x2Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), SI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+ ADDQ R8, SI
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, BX
+ ADDQ R8, CX
+
+mulAvxGFNI_3x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU (SI), Y7
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y8, Y9
+ VXORPD Y6, Y9, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y8, Y9
+ VXORPD Y7, Y9, Y7
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y2, Y8, Y9
+ VXORPD Y6, Y9, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y8, Y9
+ VXORPD Y7, Y9, Y7
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (CX), Y8
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y4, Y8, Y9
+ VXORPD Y6, Y9, Y6
+ VGF2P8AFFINEQB $0x00, Y5, Y8, Y9
+ VXORPD Y7, Y9, Y7
+
+ // Store 2 outputs
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y7, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x2Xor_end:
+ RET
+
+// func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x2_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 33 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x2_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), DI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+ ADDQ R9, DI
+
+ // Add start offset to input
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DX
+ MOVQ $0x0000000f, R9
+ MOVQ R9, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_3x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R8), Y0
+ VMOVDQU 32(R8), Y1
+ VMOVDQU (DI), Y2
+ VMOVDQU 32(DI), Y3
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y9
+ VMOVDQU 32(SI), Y11
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (R8)
+ VMOVDQU Y1, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y2, (DI)
+ VMOVDQU Y3, 32(DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x2_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_3x2_64Xor_end:
+ RET
+
+// func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x3_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 46 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x3_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), DI
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, DI
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DX
+ MOVQ $0x0000000f, R10
+ MOVQ R10, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_3x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VPXOR Y9, Y10, Y5
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y11
+ VMOVDQU 32(SI), Y13
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (R8)
+ VMOVDQU Y1, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y2, (R9)
+ VMOVDQU Y3, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y4, (DI)
+ VMOVDQU Y5, 32(DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x3_64_loop
+ VZEROUPPER
+
+mulAvxTwo_3x3_64_end:
+ RET
+
+// func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), SI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, SI
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, CX
+
+mulGFNI_3x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z12
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z12, Z9
+ VGF2P8AFFINEQB $0x00, Z1, Z12, Z10
+ VGF2P8AFFINEQB $0x00, Z2, Z12, Z11
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z12
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (CX), Z12
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z8, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Store 3 outputs
+ VMOVDQU64 Z9, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z10, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z11, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x3_64_loop
+ VZEROUPPER
+
+mulGFNI_3x3_64_end:
+ RET
+
+// func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x3(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), SI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, SI
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, CX
+
+mulAvxGFNI_3x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y12, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y12, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y12, Y11
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (CX), Y12
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y8, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Store 3 outputs
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x3_end:
+ RET
+
+// func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x3_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), SI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, SI
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, CX
+
+mulGFNI_3x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (DI), Z9
+ VMOVDQU64 (R8), Z10
+ VMOVDQU64 (SI), Z11
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z12
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z1, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z12
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (CX), Z12
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z8, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Store 3 outputs
+ VMOVDQU64 Z9, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z10, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z11, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x3Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), SI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, SI
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, CX
+
+mulAvxGFNI_3x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU (R8), Y10
+ VMOVDQU (SI), Y11
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (CX), Y12
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y8, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Store 3 outputs
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x3Xor_end:
+ RET
+
+// func mulAvxTwo_3x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x3_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 46 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x3_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), DI
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, DI
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DX
+ MOVQ $0x0000000f, R10
+ MOVQ R10, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_3x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R8), Y0
+ VMOVDQU 32(R8), Y1
+ VMOVDQU (R9), Y2
+ VMOVDQU 32(R9), Y3
+ VMOVDQU (DI), Y4
+ VMOVDQU 32(DI), Y5
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y11
+ VMOVDQU 32(SI), Y13
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (R8)
+ VMOVDQU Y1, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y2, (R9)
+ VMOVDQU Y3, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y4, (DI)
+ VMOVDQU Y5, 32(DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x3_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_3x3_64Xor_end:
+ RET
+
+// func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x4(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 33 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x4_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), DI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DI
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R11
+ MOVQ R11, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_3x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y1
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y3
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ VMOVDQU Y0, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y1, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y2, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y3, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x4_loop
+ VZEROUPPER
+
+mulAvxTwo_3x4_end:
+ RET
+
+// func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x4_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), SI
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, SI
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, CX
+
+mulGFNI_3x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z16
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z1, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z2, Z16, Z14
+ VGF2P8AFFINEQB $0x00, Z3, Z16, Z15
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z16
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
+ VXORPD Z12, Z17, Z12
+ VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
+ VXORPD Z13, Z17, Z13
+ VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (CX), Z16
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
+ VXORPD Z12, Z17, Z12
+ VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
+ VXORPD Z13, Z17, Z13
+ VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Store 4 outputs
+ VMOVDQU64 Z12, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z13, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z14, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z15, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x4_64_loop
+ VZEROUPPER
+
+mulGFNI_3x4_64_end:
+ RET
+
+// func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x4(SB), $0-88
+ // Loading 10 of 12 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), DI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DI
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DX
+
+mulAvxGFNI_3x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x4_end:
+ RET
+
+// func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x4_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), SI
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, SI
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, CX
+
+mulGFNI_3x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (DI), Z12
+ VMOVDQU64 (R8), Z13
+ VMOVDQU64 (R9), Z14
+ VMOVDQU64 (SI), Z15
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z16
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z16, Z17
+ VXORPD Z12, Z17, Z12
+ VGF2P8AFFINEQB $0x00, Z1, Z16, Z17
+ VXORPD Z13, Z17, Z13
+ VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z16
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
+ VXORPD Z12, Z17, Z12
+ VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
+ VXORPD Z13, Z17, Z13
+ VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (CX), Z16
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
+ VXORPD Z12, Z17, Z12
+ VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
+ VXORPD Z13, Z17, Z13
+ VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Store 4 outputs
+ VMOVDQU64 Z12, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z13, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z14, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z15, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x4Xor(SB), $0-88
+ // Loading 10 of 12 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), DI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DI
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DX
+
+mulAvxGFNI_3x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (R8), Y10
+ VMOVDQU (R9), Y11
+ VMOVDQU (R10), Y12
+ VMOVDQU (DI), Y13
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x4Xor_end:
+ RET
+
+// func mulAvxTwo_3x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x4Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 33 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x4Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), DI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DI
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R11
+ MOVQ R11, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_3x4Xor_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (R8), Y0
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU (R9), Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU (R10), Y2
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU (DI), Y3
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ VMOVDQU Y0, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y1, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y2, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y3, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x4Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_3x4Xor_end:
+ RET
+
+// func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x5(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 40 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x5_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), DI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, DI
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_3x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y0
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y1
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y2
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y3
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y4
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 640(CX), Y6
+ VMOVDQU 672(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 704(CX), Y6
+ VMOVDQU 736(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 768(CX), Y6
+ VMOVDQU 800(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 832(CX), Y6
+ VMOVDQU 864(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 896(CX), Y6
+ VMOVDQU 928(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ VMOVDQU Y0, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y1, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y2, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y3, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y4, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x5_loop
+ VZEROUPPER
+
+mulAvxTwo_3x5_end:
+ RET
+
+// func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x5_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), SI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, SI
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, CX
+
+mulGFNI_3x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z20
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z20, Z15
+ VGF2P8AFFINEQB $0x00, Z1, Z20, Z16
+ VGF2P8AFFINEQB $0x00, Z2, Z20, Z17
+ VGF2P8AFFINEQB $0x00, Z3, Z20, Z18
+ VGF2P8AFFINEQB $0x00, Z4, Z20, Z19
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (BX), Z20
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
+ VXORPD Z15, Z21, Z15
+ VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (CX), Z20
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
+ VXORPD Z15, Z21, Z15
+ VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Store 5 outputs
+ VMOVDQU64 Z15, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z16, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z17, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z18, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z19, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x5_64_loop
+ VZEROUPPER
+
+mulGFNI_3x5_64_end:
+ RET
+
+// func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x5(SB), $0-88
+ // Loading 9 of 15 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), DI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, DI
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DX
+
+mulAvxGFNI_3x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y12, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x5_end:
+ RET
+
+// func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x5_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), SI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, SI
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, CX
+
+mulGFNI_3x5_64Xor_loop:
+ // Load 5 outputs
+ VMOVDQU64 (DI), Z15
+ VMOVDQU64 (R8), Z16
+ VMOVDQU64 (R9), Z17
+ VMOVDQU64 (R10), Z18
+ VMOVDQU64 (SI), Z19
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z20
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z20, Z21
+ VXORPD Z15, Z21, Z15
+ VGF2P8AFFINEQB $0x00, Z1, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (BX), Z20
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
+ VXORPD Z15, Z21, Z15
+ VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (CX), Z20
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
+ VXORPD Z15, Z21, Z15
+ VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Store 5 outputs
+ VMOVDQU64 Z15, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z16, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z17, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z18, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z19, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x5Xor(SB), $0-88
+ // Loading 9 of 15 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), DI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, DI
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DX
+
+mulAvxGFNI_3x5Xor_loop:
+ // Load 5 outputs
+ VMOVDQU (R8), Y9
+ VMOVDQU (R9), Y10
+ VMOVDQU (R10), Y11
+ VMOVDQU (R11), Y12
+ VMOVDQU (DI), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y12, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x5Xor_end:
+ RET
+
+// func mulAvxTwo_3x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x5Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 40 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x5Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), DI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, DI
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_3x5Xor_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU (R8), Y0
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU (R9), Y1
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU (R10), Y2
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU (R11), Y3
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU (DI), Y4
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 640(CX), Y6
+ VMOVDQU 672(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 704(CX), Y6
+ VMOVDQU 736(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 768(CX), Y6
+ VMOVDQU 800(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 832(CX), Y6
+ VMOVDQU 864(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 896(CX), Y6
+ VMOVDQU 928(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ VMOVDQU Y0, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y1, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y2, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y3, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y4, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x5Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_3x5Xor_end:
+ RET
+
+// func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x6(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x6_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), DI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, DI
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_3x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y3
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y5
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ VMOVDQU Y0, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y1, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y2, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y3, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y4, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y5, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x6_loop
+ VZEROUPPER
+
+mulAvxTwo_3x6_end:
+ RET
+
+// func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x6_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), SI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, SI
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, CX
+
+mulGFNI_3x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (DX), Z24
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z24, Z18
+ VGF2P8AFFINEQB $0x00, Z1, Z24, Z19
+ VGF2P8AFFINEQB $0x00, Z2, Z24, Z20
+ VGF2P8AFFINEQB $0x00, Z3, Z24, Z21
+ VGF2P8AFFINEQB $0x00, Z4, Z24, Z22
+ VGF2P8AFFINEQB $0x00, Z5, Z24, Z23
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (BX), Z24
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
+ VXORPD Z18, Z25, Z18
+ VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
+ VXORPD Z19, Z25, Z19
+ VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (CX), Z24
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
+ VXORPD Z18, Z25, Z18
+ VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
+ VXORPD Z19, Z25, Z19
+ VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Store 6 outputs
+ VMOVDQU64 Z18, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z19, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z20, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z21, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z22, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z23, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x6_64_loop
+ VZEROUPPER
+
+mulGFNI_3x6_64_end:
+ RET
+
+// func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x6(SB), $0-88
+ // Loading 8 of 18 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), DI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, DI
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DX
+
+mulAvxGFNI_3x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x6_end:
+ RET
+
+// func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x6_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), SI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, SI
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, CX
+
+mulGFNI_3x6_64Xor_loop:
+ // Load 6 outputs
+ VMOVDQU64 (DI), Z18
+ VMOVDQU64 (R8), Z19
+ VMOVDQU64 (R9), Z20
+ VMOVDQU64 (R10), Z21
+ VMOVDQU64 (R11), Z22
+ VMOVDQU64 (SI), Z23
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (DX), Z24
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
+ VXORPD Z18, Z25, Z18
+ VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
+ VXORPD Z19, Z25, Z19
+ VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (BX), Z24
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
+ VXORPD Z18, Z25, Z18
+ VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
+ VXORPD Z19, Z25, Z19
+ VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (CX), Z24
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
+ VXORPD Z18, Z25, Z18
+ VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
+ VXORPD Z19, Z25, Z19
+ VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Store 6 outputs
+ VMOVDQU64 Z18, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z19, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z20, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z21, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z22, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z23, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x6Xor(SB), $0-88
+ // Loading 8 of 18 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), DI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, DI
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DX
+
+mulAvxGFNI_3x6Xor_loop:
+ // Load 6 outputs
+ VMOVDQU (R8), Y8
+ VMOVDQU (R9), Y9
+ VMOVDQU (R10), Y10
+ VMOVDQU (R11), Y11
+ VMOVDQU (R12), Y12
+ VMOVDQU (DI), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x6Xor_end:
+ RET
+
+// func mulAvxTwo_3x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x6Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x6Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), DI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, DI
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_3x6Xor_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU (R8), Y0
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU (R9), Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU (R10), Y2
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU (R11), Y3
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU (R12), Y4
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU (DI), Y5
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ VMOVDQU Y0, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y1, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y2, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y3, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y4, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y5, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x6Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_3x6Xor_end:
+ RET
+
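+// The mulAvxTwo_* routines below multiply the input shards 32 bytes at a
+// time using 4-bit table lookups: each input byte is split into its low and
+// high nibble (VPAND with the broadcast 0x0f mask, VPSRLQ by 4), both
+// nibbles are run through 32-byte lookup tables taken from the matrix slice
+// with VPSHUFB, and the two partial results are XORed together to form the
+// GF(2^8) products that are accumulated into the output registers.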
+// func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x7(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 54 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x7_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), DI
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, DI
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_3x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y0
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y1
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y2
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y3
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y4
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y5
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y6
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 896(CX), Y8
+ VMOVDQU 928(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 960(CX), Y8
+ VMOVDQU 992(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1024(CX), Y8
+ VMOVDQU 1056(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1088(CX), Y8
+ VMOVDQU 1120(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1152(CX), Y8
+ VMOVDQU 1184(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1216(CX), Y8
+ VMOVDQU 1248(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1280(CX), Y8
+ VMOVDQU 1312(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ VMOVDQU Y0, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y1, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y2, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y3, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y4, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y5, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x7_loop
+ VZEROUPPER
+
+mulAvxTwo_3x7_end:
+ RET
+
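+// The mulGFNI_*_64 routines process 64 bytes per iteration in 512-bit
+// registers: each matrix coefficient is broadcast as an 8x8 bit matrix with
+// VBROADCASTF32X2 and applied to the input with VGF2P8AFFINEQB, and the
+// partial products from each input shard are folded into the outputs with
+// VXORPD.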
+// func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x7_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), SI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, SI
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, CX
+
+mulGFNI_3x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (DX), Z28
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z28, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z28, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z28, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z28, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z28, Z27
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (BX), Z28
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z7, Z28, Z29
+ VXORPD Z21, Z29, Z21
+ VGF2P8AFFINEQB $0x00, Z8, Z28, Z29
+ VXORPD Z22, Z29, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z28, Z29
+ VXORPD Z23, Z29, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (CX), Z28
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z14, Z28, Z29
+ VXORPD Z21, Z29, Z21
+ VGF2P8AFFINEQB $0x00, Z15, Z28, Z29
+ VXORPD Z22, Z29, Z22
+ VGF2P8AFFINEQB $0x00, Z16, Z28, Z29
+ VXORPD Z23, Z29, Z23
+ VGF2P8AFFINEQB $0x00, Z17, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z18, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z19, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z20, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Store 7 outputs
+ VMOVDQU64 Z21, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z22, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z23, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x7_64_loop
+ VZEROUPPER
+
+mulGFNI_3x7_64_end:
+ RET
+
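+// The mulAvxGFNI_* routines apply the same VGF2P8AFFINEQB approach to
+// 256-bit registers (32 bytes per iteration). Only part of the matrix fits
+// in YMM registers, so the remaining coefficients are re-broadcast from
+// memory with VBROADCASTSD inside the loop.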
+// func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x7(SB), $0-88
+ // Loading 7 of 21 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), DI
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, DI
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DX
+
+mulAvxGFNI_3x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y8, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x7_end:
+ RET
+
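+// The *Xor variants accumulate into the destination instead of overwriting
+// it: the current contents of every output shard are loaded at the top of
+// the loop and the freshly computed partial products are XORed onto them
+// before the results are stored back.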
+// func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x7_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), SI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, SI
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, CX
+
+mulGFNI_3x7_64Xor_loop:
+ // Load 7 outputs
+ VMOVDQU64 (DI), Z21
+ VMOVDQU64 (R8), Z22
+ VMOVDQU64 (R9), Z23
+ VMOVDQU64 (R10), Z24
+ VMOVDQU64 (R11), Z25
+ VMOVDQU64 (R12), Z26
+ VMOVDQU64 (SI), Z27
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (DX), Z28
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z28, Z29
+ VXORPD Z21, Z29, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z28, Z29
+ VXORPD Z22, Z29, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z28, Z29
+ VXORPD Z23, Z29, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (BX), Z28
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z7, Z28, Z29
+ VXORPD Z21, Z29, Z21
+ VGF2P8AFFINEQB $0x00, Z8, Z28, Z29
+ VXORPD Z22, Z29, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z28, Z29
+ VXORPD Z23, Z29, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (CX), Z28
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z14, Z28, Z29
+ VXORPD Z21, Z29, Z21
+ VGF2P8AFFINEQB $0x00, Z15, Z28, Z29
+ VXORPD Z22, Z29, Z22
+ VGF2P8AFFINEQB $0x00, Z16, Z28, Z29
+ VXORPD Z23, Z29, Z23
+ VGF2P8AFFINEQB $0x00, Z17, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z18, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z19, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z20, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Store 7 outputs
+ VMOVDQU64 Z21, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z22, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z23, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x7Xor(SB), $0-88
+ // Loading 7 of 21 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), DI
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, DI
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DX
+
+mulAvxGFNI_3x7Xor_loop:
+ // Load 7 outputs
+ VMOVDQU (R8), Y7
+ VMOVDQU (R9), Y8
+ VMOVDQU (R10), Y9
+ VMOVDQU (R11), Y10
+ VMOVDQU (R12), Y11
+ VMOVDQU (R13), Y12
+ VMOVDQU (DI), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y8, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x7Xor_end:
+ RET
+
+// func mulAvxTwo_3x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x7Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 54 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x7Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), DI
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, DI
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_3x7Xor_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU (R8), Y0
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU (R9), Y1
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU (R10), Y2
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU (R11), Y3
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU (R12), Y4
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU (R13), Y5
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU (DI), Y6
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 896(CX), Y8
+ VMOVDQU 928(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 960(CX), Y8
+ VMOVDQU 992(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1024(CX), Y8
+ VMOVDQU 1056(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1088(CX), Y8
+ VMOVDQU 1120(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1152(CX), Y8
+ VMOVDQU 1184(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1216(CX), Y8
+ VMOVDQU 1248(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1280(CX), Y8
+ VMOVDQU 1312(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ VMOVDQU Y0, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y1, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y2, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y3, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y4, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y5, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x7Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_3x7Xor_end:
+ RET
+
+// func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x8(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 61 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x8_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), DI
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, DI
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_3x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y0
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y2
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y4
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y5
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y6
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y7
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1024(CX), Y9
+ VMOVDQU 1056(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1088(CX), Y9
+ VMOVDQU 1120(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1152(CX), Y9
+ VMOVDQU 1184(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1216(CX), Y9
+ VMOVDQU 1248(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y9
+ VMOVDQU 1312(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1344(CX), Y9
+ VMOVDQU 1376(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1408(CX), Y9
+ VMOVDQU 1440(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1472(CX), Y9
+ VMOVDQU 1504(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ VMOVDQU Y0, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y1, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y2, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y3, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y4, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y5, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y6, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x8_loop
+ VZEROUPPER
+
+mulAvxTwo_3x8_end:
+ RET
+
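+// When a matrix no longer fits entirely in ZMM registers, the leftover
+// coefficients are applied straight from memory using the broadcast form
+// VGF2P8AFFINEQB.BCST, as in the 3x8 routines below.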
+// func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x8_64(SB), $0-88
+ // Loading 22 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), DI
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, DI
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DX
+
+mulGFNI_3x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ VMOVDQU64 Z22, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z23, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x8_64_loop
+ VZEROUPPER
+
+mulGFNI_3x8_64_end:
+ RET
+
+// func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x8(SB), $0-88
+ // Loading 6 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), DI
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, DI
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DX
+
+mulAvxGFNI_3x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y7, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x8_end:
+ RET
+
+// func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x8_64Xor(SB), $0-88
+ // Loading 22 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), DI
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, DI
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DX
+
+mulGFNI_3x8_64Xor_loop:
+ // Load 8 outputs
+ VMOVDQU64 (R8), Z22
+ VMOVDQU64 (R9), Z23
+ VMOVDQU64 (R10), Z24
+ VMOVDQU64 (R11), Z25
+ VMOVDQU64 (R12), Z26
+ VMOVDQU64 (R13), Z27
+ VMOVDQU64 (R14), Z28
+ VMOVDQU64 (DI), Z29
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ VMOVDQU64 Z22, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z23, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x8Xor(SB), $0-88
+ // Loading 6 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), DI
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, DI
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DX
+
+mulAvxGFNI_3x8Xor_loop:
+ // Load 8 outputs
+ VMOVDQU (R8), Y6
+ VMOVDQU (R9), Y7
+ VMOVDQU (R10), Y8
+ VMOVDQU (R11), Y9
+ VMOVDQU (R12), Y10
+ VMOVDQU (R13), Y11
+ VMOVDQU (R14), Y12
+ VMOVDQU (DI), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y7, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x8Xor_end:
+ RET
+
+// func mulAvxTwo_3x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x8Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 61 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x8Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), DI
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, DI
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_3x8Xor_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU (R8), Y0
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU (R9), Y1
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU (R10), Y2
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU (R11), Y3
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU (R12), Y4
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU (R13), Y5
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU (R14), Y6
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU (DI), Y7
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1024(CX), Y9
+ VMOVDQU 1056(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1088(CX), Y9
+ VMOVDQU 1120(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1152(CX), Y9
+ VMOVDQU 1184(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1216(CX), Y9
+ VMOVDQU 1248(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y9
+ VMOVDQU 1312(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1344(CX), Y9
+ VMOVDQU 1376(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1408(CX), Y9
+ VMOVDQU 1440(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1472(CX), Y9
+ VMOVDQU 1504(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ VMOVDQU Y0, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y1, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y2, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y3, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y4, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y5, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y6, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x8Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_3x8Xor_end:
+ RET
+
+// func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x9(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 68 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x9_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_3x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y0
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y1
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y2
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y3
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y4
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y5
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y6
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y7
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y8
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1152(CX), Y10
+ VMOVDQU 1184(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1216(CX), Y10
+ VMOVDQU 1248(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1280(CX), Y10
+ VMOVDQU 1312(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1344(CX), Y10
+ VMOVDQU 1376(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1408(CX), Y10
+ VMOVDQU 1440(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 1472(CX), Y10
+ VMOVDQU 1504(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 1536(CX), Y10
+ VMOVDQU 1568(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1600(CX), Y10
+ VMOVDQU 1632(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1664(CX), Y10
+ VMOVDQU 1696(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ VMOVDQU Y0, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y1, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y2, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y3, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y4, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y5, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y6, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y7, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y8, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x9_loop
+ VZEROUPPER
+
+mulAvxTwo_3x9_end:
+ RET
+
+// func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x9_64(SB), $8-88
+ // Loading 21 of 27 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DX
+
+mulGFNI_3x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ VMOVDQU64 Z21, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x9_64_loop
+ VZEROUPPER
+
+mulGFNI_3x9_64_end:
+ RET
+
+// func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x9(SB), $8-88
+ // Loading 5 of 27 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DX
+
+mulAvxGFNI_3x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x9_end:
+ RET
+
+// func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x9_64Xor(SB), $8-88
+ // Loading 21 of 27 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DX
+
+mulGFNI_3x9_64Xor_loop:
+ // Load 9 outputs
+ VMOVDQU64 (R8), Z21
+ VMOVDQU64 (R9), Z22
+ VMOVDQU64 (R10), Z23
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (DI), Z29
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ VMOVDQU64 Z21, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x9Xor(SB), $8-88
+ // Loading 5 of 27 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DX
+
+mulAvxGFNI_3x9Xor_loop:
+ // Load 9 outputs
+ VMOVDQU (R8), Y5
+ VMOVDQU (R9), Y6
+ VMOVDQU (R10), Y7
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (DI), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x9Xor_end:
+ RET
+
+// func mulAvxTwo_3x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x9Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 68 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x9Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_3x9Xor_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU (R8), Y0
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU (R9), Y1
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU (R10), Y2
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU (R11), Y3
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU (R12), Y4
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU (R13), Y5
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU (R14), Y6
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU (R15), Y7
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU (DI), Y8
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1152(CX), Y10
+ VMOVDQU 1184(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1216(CX), Y10
+ VMOVDQU 1248(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1280(CX), Y10
+ VMOVDQU 1312(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1344(CX), Y10
+ VMOVDQU 1376(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1408(CX), Y10
+ VMOVDQU 1440(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 1472(CX), Y10
+ VMOVDQU 1504(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 1536(CX), Y10
+ VMOVDQU 1568(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1600(CX), Y10
+ VMOVDQU 1632(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1664(CX), Y10
+ VMOVDQU 1696(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ VMOVDQU Y0, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y1, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y2, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y3, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y4, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y5, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y6, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y7, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y8, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_3x9Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_3x9Xor_end:
+ RET
+
+// func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x10(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 75 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x10_end
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), AX
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), R15
+ MOVQ 216(SI), SI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, SI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, AX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X10
+ VPBROADCASTB X10, Y10
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxTwo_3x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y0
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y1
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y2
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y3
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y4
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y5
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y6
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y7
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y8
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y9
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (AX), Y13
+ ADDQ $0x20, AX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1280(CX), Y11
+ VMOVDQU 1312(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1344(CX), Y11
+ VMOVDQU 1376(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 1408(CX), Y11
+ VMOVDQU 1440(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 1472(CX), Y11
+ VMOVDQU 1504(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 1536(CX), Y11
+ VMOVDQU 1568(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 1600(CX), Y11
+ VMOVDQU 1632(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1664(CX), Y11
+ VMOVDQU 1696(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1728(CX), Y11
+ VMOVDQU 1760(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1792(CX), Y11
+ VMOVDQU 1824(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1856(CX), Y11
+ VMOVDQU 1888(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ VMOVDQU Y0, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y1, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y2, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y3, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y4, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y5, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y6, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y7, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y8, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y9, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxTwo_3x10_loop
+ VZEROUPPER
+
+mulAvxTwo_3x10_end:
+ RET
+
+// func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x10_64(SB), $8-88
+ // Loading 20 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), AX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), R15
+ MOVQ 216(SI), SI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, SI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_3x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ VMOVDQU64 Z20, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z21, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_3x10_64_loop
+ VZEROUPPER
+
+mulGFNI_3x10_64_end:
+ RET
+
+// func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x10(SB), $8-88
+ // Loading 4 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), AX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), R15
+ MOVQ 216(SI), SI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, SI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_3x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ VMOVDQU Y4, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_3x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x10_end:
+ RET
+
+// func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x10_64Xor(SB), $8-88
+ // Loading 20 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), AX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), R15
+ MOVQ 216(SI), SI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, SI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_3x10_64Xor_loop:
+ // Load 10 outputs
+ VMOVDQU64 (DI), Z20
+ VMOVDQU64 (R8), Z21
+ VMOVDQU64 (R9), Z22
+ VMOVDQU64 (R10), Z23
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (SI), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ VMOVDQU64 Z20, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z21, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_3x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x10Xor(SB), $8-88
+ // Loading 4 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), AX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), R15
+ MOVQ 216(SI), SI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, SI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_3x10Xor_loop:
+ // Load 10 outputs
+ VMOVDQU (DI), Y4
+ VMOVDQU (R8), Y5
+ VMOVDQU (R9), Y6
+ VMOVDQU (R10), Y7
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (SI), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ VMOVDQU Y4, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_3x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x10Xor_end:
+ RET
+
+// func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_3x10Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 75 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_3x10Xor_end
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), AX
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), R15
+ MOVQ 216(SI), SI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, SI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, AX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X10
+ VPBROADCASTB X10, Y10
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxTwo_3x10Xor_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU (DI), Y0
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU (R8), Y1
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU (R9), Y2
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU (R10), Y3
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU (R11), Y4
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU (R12), Y5
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU (R13), Y6
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU (R14), Y7
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU (R15), Y8
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU (SI), Y9
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (AX), Y13
+ ADDQ $0x20, AX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1280(CX), Y11
+ VMOVDQU 1312(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1344(CX), Y11
+ VMOVDQU 1376(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 1408(CX), Y11
+ VMOVDQU 1440(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 1472(CX), Y11
+ VMOVDQU 1504(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 1536(CX), Y11
+ VMOVDQU 1568(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 1600(CX), Y11
+ VMOVDQU 1632(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1664(CX), Y11
+ VMOVDQU 1696(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1728(CX), Y11
+ VMOVDQU 1760(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1792(CX), Y11
+ VMOVDQU 1824(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1856(CX), Y11
+ VMOVDQU 1888(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ VMOVDQU Y0, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y1, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y2, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y3, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y4, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y5, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y6, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y7, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y8, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y9, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxTwo_3x10Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_3x10Xor_end:
+ RET
+
+// func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x1_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x1_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+
+ // Add start offset to input
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, DX
+ MOVQ $0x0000000f, R9
+ MOVQ R9, X2
+ VPBROADCASTB X2, Y2
+
+mulAvxTwo_4x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU 32(BX), Y5
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ VPXOR Y3, Y4, Y0
+ VPXOR Y5, Y6, Y1
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU 32(SI), Y5
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU 32(DI), Y5
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU (DX), Y6
+ VMOVDQU 32(DX), Y5
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Store 1 outputs
+ VMOVDQU Y0, (R8)
+ VMOVDQU Y1, 32(R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_4x1_64_loop
+ VZEROUPPER
+
+mulAvxTwo_4x1_64_end:
+ RET
+
+// func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 7 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), DI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, CX
+
+mulGFNI_4x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z5
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z5, Z4
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z5
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z5
+ VXORPD Z4, Z5, Z4
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z5
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z5, Z5
+ VXORPD Z4, Z5, Z4
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (CX), Z5
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z3, Z5, Z5
+ VXORPD Z4, Z5, Z4
+
+ // Store 1 outputs
+ VMOVDQU64 Z4, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x1_64_loop
+ VZEROUPPER
+
+mulGFNI_4x1_64_end:
+ RET
+
+// func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 7 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), DI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, CX
+
+mulAvxGFNI_4x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y5
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y5, Y4
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y5
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y5, Y5
+ VXORPD Y4, Y5, Y4
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y5
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y5, Y5
+ VXORPD Y4, Y5, Y4
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (CX), Y5
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y3, Y5, Y5
+ VXORPD Y4, Y5, Y4
+
+ // Store 1 outputs
+ VMOVDQU Y4, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x1_end:
+ RET
+
+// func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 7 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), DI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, CX
+
+mulGFNI_4x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (DI), Z4
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z5
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z5, Z5
+ VXORPD Z4, Z5, Z4
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z5
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z5
+ VXORPD Z4, Z5, Z4
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z5
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z5, Z5
+ VXORPD Z4, Z5, Z4
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (CX), Z5
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z3, Z5, Z5
+ VXORPD Z4, Z5, Z4
+
+ // Store 1 outputs
+ VMOVDQU64 Z4, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 7 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), DI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, CX
+
+mulAvxGFNI_4x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (DI), Y4
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y5
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y5, Y5
+ VXORPD Y4, Y5, Y4
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y5
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y5, Y5
+ VXORPD Y4, Y5, Y4
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y5
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y5, Y5
+ VXORPD Y4, Y5, Y4
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (CX), Y5
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y3, Y5, Y5
+ VXORPD Y4, Y5, Y4
+
+ // Store 1 outputs
+ VMOVDQU Y4, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x1Xor_end:
+ RET
+
+// func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x1_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x1_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+
+ // Add start offset to input
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, DX
+ MOVQ $0x0000000f, R9
+ MOVQ R9, X2
+ VPBROADCASTB X2, Y2
+
+mulAvxTwo_4x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R8), Y0
+ VMOVDQU 32(R8), Y1
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU 32(BX), Y5
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU 32(SI), Y5
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU 32(DI), Y5
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU (DX), Y6
+ VMOVDQU 32(DX), Y5
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Store 1 outputs
+ VMOVDQU Y0, (R8)
+ VMOVDQU Y1, 32(R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_4x1_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_4x1_64Xor_end:
+ RET
+
+// func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x2_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 41 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x2_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R8
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+ ADDQ R10, R8
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, DX
+ MOVQ $0x0000000f, R10
+ MOVQ R10, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_4x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VPXOR Y7, Y8, Y3
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y9
+ VMOVDQU 32(SI), Y11
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU 32(DI), Y11
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (R9)
+ VMOVDQU Y1, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y2, (R8)
+ VMOVDQU Y3, 32(R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_4x2_64_loop
+ VZEROUPPER
+
+mulAvxTwo_4x2_64_end:
+ RET
+
+// func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), DI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+ ADDQ R9, DI
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, CX
+
+mulGFNI_4x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z10
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z10, Z8
+ VGF2P8AFFINEQB $0x00, Z1, Z10, Z9
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z10
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z10, Z11
+ VXORPD Z8, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z3, Z10, Z11
+ VXORPD Z9, Z11, Z9
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z10
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z10, Z11
+ VXORPD Z8, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z5, Z10, Z11
+ VXORPD Z9, Z11, Z9
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (CX), Z10
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z6, Z10, Z11
+ VXORPD Z8, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z7, Z10, Z11
+ VXORPD Z9, Z11, Z9
+
+ // Store 2 outputs
+ VMOVDQU64 Z8, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z9, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x2_64_loop
+ VZEROUPPER
+
+mulGFNI_4x2_64_end:
+ RET
+
+// func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x2(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), DI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+ ADDQ R9, DI
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, CX
+
+mulAvxGFNI_4x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y10, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y10, Y9
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y2, Y10, Y11
+ VXORPD Y8, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y10, Y11
+ VXORPD Y9, Y11, Y9
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y10, Y11
+ VXORPD Y8, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y5, Y10, Y11
+ VXORPD Y9, Y11, Y9
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (CX), Y10
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y6, Y10, Y11
+ VXORPD Y8, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y10, Y11
+ VXORPD Y9, Y11, Y9
+
+ // Store 2 outputs
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x2_end:
+ RET
+
+// func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), DI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+ ADDQ R9, DI
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, CX
+
+mulGFNI_4x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (R8), Z8
+ VMOVDQU64 (DI), Z9
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z10
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z10, Z11
+ VXORPD Z8, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z1, Z10, Z11
+ VXORPD Z9, Z11, Z9
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z10
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z10, Z11
+ VXORPD Z8, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z3, Z10, Z11
+ VXORPD Z9, Z11, Z9
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z10
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z10, Z11
+ VXORPD Z8, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z5, Z10, Z11
+ VXORPD Z9, Z11, Z9
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (CX), Z10
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z6, Z10, Z11
+ VXORPD Z8, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z7, Z10, Z11
+ VXORPD Z9, Z11, Z9
+
+ // Store 2 outputs
+ VMOVDQU64 Z8, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z9, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x2Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), DI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+ ADDQ R9, DI
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, CX
+
+mulAvxGFNI_4x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R8), Y8
+ VMOVDQU (DI), Y9
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y10, Y11
+ VXORPD Y8, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y10, Y11
+ VXORPD Y9, Y11, Y9
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y2, Y10, Y11
+ VXORPD Y8, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y10, Y11
+ VXORPD Y9, Y11, Y9
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y10, Y11
+ VXORPD Y8, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y5, Y10, Y11
+ VXORPD Y9, Y11, Y9
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (CX), Y10
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y6, Y10, Y11
+ VXORPD Y8, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y10, Y11
+ VXORPD Y9, Y11, Y9
+
+ // Store 2 outputs
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x2Xor_end:
+ RET
+
+// func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x2_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 41 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x2_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R8
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+ ADDQ R10, R8
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, DX
+ MOVQ $0x0000000f, R10
+ MOVQ R10, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_4x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R9), Y0
+ VMOVDQU 32(R9), Y1
+ VMOVDQU (R8), Y2
+ VMOVDQU 32(R8), Y3
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y9
+ VMOVDQU 32(SI), Y11
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU 32(DI), Y11
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (R9)
+ VMOVDQU Y1, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y2, (R8)
+ VMOVDQU Y3, 32(R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_4x2_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_4x2_64Xor_end:
+ RET
+
+// func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x3_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 58 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x3_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R8
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, R8
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R11
+ MOVQ R11, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_4x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VPXOR Y9, Y10, Y5
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y11
+ VMOVDQU 32(SI), Y13
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y11
+ VMOVDQU 32(DI), Y13
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (R9)
+ VMOVDQU Y1, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y2, (R10)
+ VMOVDQU Y3, 32(R10)
+ ADDQ $0x40, R10
+ VMOVDQU Y4, (R8)
+ VMOVDQU Y5, 32(R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_4x3_64_loop
+ VZEROUPPER
+
+mulAvxTwo_4x3_64_end:
+ RET
+
+// func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), DI
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, DI
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, CX
+
+mulGFNI_4x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z15
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z1, Z15, Z13
+ VGF2P8AFFINEQB $0x00, Z2, Z15, Z14
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z15
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z4, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z5, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z15
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z7, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z8, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (CX), Z15
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z9, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z10, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z11, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Store 3 outputs
+ VMOVDQU64 Z12, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z13, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z14, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x3_64_loop
+ VZEROUPPER
+
+mulGFNI_4x3_64_end:
+ RET
+
+// func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x3(SB), $0-88
+ // Loading 11 of 12 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R8
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, R8
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, DX
+
+mulAvxGFNI_4x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x3_end:
+ RET
+
+// func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x3_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), DI
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, DI
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, CX
+
+mulGFNI_4x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (R8), Z12
+ VMOVDQU64 (R9), Z13
+ VMOVDQU64 (DI), Z14
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z15
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z1, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z2, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z15
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z4, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z5, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z15
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z7, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z8, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (CX), Z15
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z9, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z10, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z11, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Store 3 outputs
+ VMOVDQU64 Z12, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z13, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z14, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x3Xor(SB), $0-88
+ // Loading 11 of 12 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R8
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, R8
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, DX
+
+mulAvxGFNI_4x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R9), Y11
+ VMOVDQU (R10), Y12
+ VMOVDQU (R8), Y13
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x3Xor_end:
+ RET
+
+// func mulAvxTwo_4x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x3_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 58 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x3_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R8
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, R8
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R11
+ MOVQ R11, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_4x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R9), Y0
+ VMOVDQU 32(R9), Y1
+ VMOVDQU (R10), Y2
+ VMOVDQU 32(R10), Y3
+ VMOVDQU (R8), Y4
+ VMOVDQU 32(R8), Y5
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y11
+ VMOVDQU 32(SI), Y13
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y11
+ VMOVDQU 32(DI), Y13
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (R9)
+ VMOVDQU Y1, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y2, (R10)
+ VMOVDQU Y3, 32(R10)
+ ADDQ $0x40, R10
+ VMOVDQU Y4, (R8)
+ VMOVDQU Y5, 32(R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_4x3_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_4x3_64Xor_end:
+ RET
+
+// func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x4(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 41 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x4_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R8
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R8
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_4x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y1
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y3
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ VMOVDQU Y0, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y1, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y2, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y3, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_4x4_loop
+ VZEROUPPER
+
+mulAvxTwo_4x4_end:
+ RET
+
+// func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x4_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), DI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DI
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, CX
+
+mulGFNI_4x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z20
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z20, Z16
+ VGF2P8AFFINEQB $0x00, Z1, Z20, Z17
+ VGF2P8AFFINEQB $0x00, Z2, Z20, Z18
+ VGF2P8AFFINEQB $0x00, Z3, Z20, Z19
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z20
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (SI), Z20
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (CX), Z20
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z15, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Store 4 outputs
+ VMOVDQU64 Z16, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z17, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z18, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z19, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x4_64_loop
+ VZEROUPPER
+
+mulGFNI_4x4_64_end:
+ RET
+
+// func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x4(SB), $0-88
+ // Loading 10 of 16 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R8
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R8
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, DX
+
+mulAvxGFNI_4x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y12, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x4_end:
+ RET
+
+// func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x4_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), DI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DI
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, CX
+
+mulGFNI_4x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (R8), Z16
+ VMOVDQU64 (R9), Z17
+ VMOVDQU64 (R10), Z18
+ VMOVDQU64 (DI), Z19
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z20
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z1, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z20
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (SI), Z20
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (CX), Z20
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z15, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Store 4 outputs
+ VMOVDQU64 Z16, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z17, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z18, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z19, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x4Xor(SB), $0-88
+ // Loading 10 of 16 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R8
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R8
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, DX
+
+mulAvxGFNI_4x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (R9), Y10
+ VMOVDQU (R10), Y11
+ VMOVDQU (R11), Y12
+ VMOVDQU (R8), Y13
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y12, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x4Xor_end:
+ RET
+
+// func mulAvxTwo_4x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x4Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 41 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x4Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R8
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R8
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_4x4Xor_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (R9), Y0
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU (R10), Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU (R11), Y2
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU (R8), Y3
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ VMOVDQU Y0, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y1, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y2, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y3, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_4x4Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_4x4Xor_end:
+ RET
+
+// func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x5(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x5_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R8
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R8
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_4x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y0
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y1
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y2
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y3
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y4
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 640(CX), Y6
+ VMOVDQU 672(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 704(CX), Y6
+ VMOVDQU 736(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 768(CX), Y6
+ VMOVDQU 800(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 832(CX), Y6
+ VMOVDQU 864(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 896(CX), Y6
+ VMOVDQU 928(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 960(CX), Y6
+ VMOVDQU 992(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1024(CX), Y6
+ VMOVDQU 1056(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1088(CX), Y6
+ VMOVDQU 1120(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1152(CX), Y6
+ VMOVDQU 1184(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1216(CX), Y6
+ VMOVDQU 1248(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ VMOVDQU Y0, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y1, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y2, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y3, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y4, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_4x5_loop
+ VZEROUPPER
+
+mulAvxTwo_4x5_end:
+ RET
+
+// func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x5_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 27 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), DI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, DI
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, CX
+
+mulGFNI_4x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z25
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z25, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z25, Z24
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (BX), Z25
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z5, Z25, Z26
+ VXORPD Z20, Z26, Z20
+ VGF2P8AFFINEQB $0x00, Z6, Z25, Z26
+ VXORPD Z21, Z26, Z21
+ VGF2P8AFFINEQB $0x00, Z7, Z25, Z26
+ VXORPD Z22, Z26, Z22
+ VGF2P8AFFINEQB $0x00, Z8, Z25, Z26
+ VXORPD Z23, Z26, Z23
+ VGF2P8AFFINEQB $0x00, Z9, Z25, Z26
+ VXORPD Z24, Z26, Z24
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (SI), Z25
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z25, Z26
+ VXORPD Z20, Z26, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z25, Z26
+ VXORPD Z21, Z26, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z25, Z26
+ VXORPD Z22, Z26, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z25, Z26
+ VXORPD Z23, Z26, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z25, Z26
+ VXORPD Z24, Z26, Z24
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (CX), Z25
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z15, Z25, Z26
+ VXORPD Z20, Z26, Z20
+ VGF2P8AFFINEQB $0x00, Z16, Z25, Z26
+ VXORPD Z21, Z26, Z21
+ VGF2P8AFFINEQB $0x00, Z17, Z25, Z26
+ VXORPD Z22, Z26, Z22
+ VGF2P8AFFINEQB $0x00, Z18, Z25, Z26
+ VXORPD Z23, Z26, Z23
+ VGF2P8AFFINEQB $0x00, Z19, Z25, Z26
+ VXORPD Z24, Z26, Z24
+
+ // Store 5 outputs
+ VMOVDQU64 Z20, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z21, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z22, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z23, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z24, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x5_64_loop
+ VZEROUPPER
+
+mulGFNI_4x5_64_end:
+ RET
+
+// func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x5(SB), $0-88
+ // Loading 9 of 20 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 27 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R8
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R8
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, DX
+
+mulAvxGFNI_4x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x5_end:
+ RET
+
+// func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x5_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 27 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), DI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, DI
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, CX
+
+mulGFNI_4x5_64Xor_loop:
+ // Load 5 outputs
+ VMOVDQU64 (R8), Z20
+ VMOVDQU64 (R9), Z21
+ VMOVDQU64 (R10), Z22
+ VMOVDQU64 (R11), Z23
+ VMOVDQU64 (DI), Z24
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z25
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z25, Z26
+ VXORPD Z20, Z26, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z25, Z26
+ VXORPD Z21, Z26, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z25, Z26
+ VXORPD Z22, Z26, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z25, Z26
+ VXORPD Z23, Z26, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z25, Z26
+ VXORPD Z24, Z26, Z24
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (BX), Z25
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z5, Z25, Z26
+ VXORPD Z20, Z26, Z20
+ VGF2P8AFFINEQB $0x00, Z6, Z25, Z26
+ VXORPD Z21, Z26, Z21
+ VGF2P8AFFINEQB $0x00, Z7, Z25, Z26
+ VXORPD Z22, Z26, Z22
+ VGF2P8AFFINEQB $0x00, Z8, Z25, Z26
+ VXORPD Z23, Z26, Z23
+ VGF2P8AFFINEQB $0x00, Z9, Z25, Z26
+ VXORPD Z24, Z26, Z24
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (SI), Z25
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z25, Z26
+ VXORPD Z20, Z26, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z25, Z26
+ VXORPD Z21, Z26, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z25, Z26
+ VXORPD Z22, Z26, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z25, Z26
+ VXORPD Z23, Z26, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z25, Z26
+ VXORPD Z24, Z26, Z24
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (CX), Z25
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z15, Z25, Z26
+ VXORPD Z20, Z26, Z20
+ VGF2P8AFFINEQB $0x00, Z16, Z25, Z26
+ VXORPD Z21, Z26, Z21
+ VGF2P8AFFINEQB $0x00, Z17, Z25, Z26
+ VXORPD Z22, Z26, Z22
+ VGF2P8AFFINEQB $0x00, Z18, Z25, Z26
+ VXORPD Z23, Z26, Z23
+ VGF2P8AFFINEQB $0x00, Z19, Z25, Z26
+ VXORPD Z24, Z26, Z24
+
+ // Store 5 outputs
+ VMOVDQU64 Z20, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z21, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z22, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z23, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z24, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x5Xor(SB), $0-88
+ // Loading 9 of 20 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 27 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R8
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R8
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, DX
+
+mulAvxGFNI_4x5Xor_loop:
+ // Load 5 outputs
+ VMOVDQU (R9), Y9
+ VMOVDQU (R10), Y10
+ VMOVDQU (R11), Y11
+ VMOVDQU (R12), Y12
+ VMOVDQU (R8), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x5Xor_end:
+ RET
+
+// func mulAvxTwo_4x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x5Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x5Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R8
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R8
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_4x5Xor_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU (R9), Y0
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU (R10), Y1
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU (R11), Y2
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU (R12), Y3
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU (R8), Y4
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 640(CX), Y6
+ VMOVDQU 672(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 704(CX), Y6
+ VMOVDQU 736(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 768(CX), Y6
+ VMOVDQU 800(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 832(CX), Y6
+ VMOVDQU 864(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 896(CX), Y6
+ VMOVDQU 928(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 960(CX), Y6
+ VMOVDQU 992(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1024(CX), Y6
+ VMOVDQU 1056(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1088(CX), Y6
+ VMOVDQU 1120(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1152(CX), Y6
+ VMOVDQU 1184(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1216(CX), Y6
+ VMOVDQU 1248(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ VMOVDQU Y0, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y1, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y2, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y3, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y4, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_4x5Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_4x5Xor_end:
+ RET
+
+// func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x6(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 59 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x6_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R8
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R8
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_4x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y3
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y5
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ VMOVDQU Y0, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y1, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y2, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y3, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y4, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_4x6_loop
+ VZEROUPPER
+
+mulAvxTwo_4x6_end:
+ RET
+
+// func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x6_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), DI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, DI
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, CX
+
+mulGFNI_4x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (CX), Z30
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ VMOVDQU64 Z24, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z25, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z26, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z27, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z28, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z29, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x6_64_loop
+ VZEROUPPER
+
+mulGFNI_4x6_64_end:
+ RET
+
+// func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x6(SB), $0-88
+ // Loading 8 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R8
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R8
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, DX
+
+mulAvxGFNI_4x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x6_end:
+ RET
+
+// func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x6_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), DI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, DI
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, CX
+
+mulGFNI_4x6_64Xor_loop:
+ // Load 6 outputs
+ VMOVDQU64 (R8), Z24
+ VMOVDQU64 (R9), Z25
+ VMOVDQU64 (R10), Z26
+ VMOVDQU64 (R11), Z27
+ VMOVDQU64 (R12), Z28
+ VMOVDQU64 (DI), Z29
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (CX), Z30
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ VMOVDQU64 Z24, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z25, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z26, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z27, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z28, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z29, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x6Xor(SB), $0-88
+ // Loading 8 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R8
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R8
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, DX
+
+mulAvxGFNI_4x6Xor_loop:
+ // Load 6 outputs
+ VMOVDQU (R9), Y8
+ VMOVDQU (R10), Y9
+ VMOVDQU (R11), Y10
+ VMOVDQU (R12), Y11
+ VMOVDQU (R13), Y12
+ VMOVDQU (R8), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x6Xor_end:
+ RET
+
+// func mulAvxTwo_4x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x6Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 59 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x6Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R8
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R8
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_4x6Xor_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU (R9), Y0
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU (R10), Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU (R11), Y2
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU (R12), Y3
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU (R13), Y4
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU (R8), Y5
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ VMOVDQU Y0, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y1, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y2, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y3, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y4, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_4x6Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_4x6Xor_end:
+ RET
+
+// func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x7(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 68 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x7_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R8
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R8
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_4x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y0
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y1
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y2
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y3
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y4
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y5
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y6
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 896(CX), Y8
+ VMOVDQU 928(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 960(CX), Y8
+ VMOVDQU 992(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1024(CX), Y8
+ VMOVDQU 1056(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1088(CX), Y8
+ VMOVDQU 1120(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1152(CX), Y8
+ VMOVDQU 1184(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1216(CX), Y8
+ VMOVDQU 1248(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1280(CX), Y8
+ VMOVDQU 1312(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1344(CX), Y8
+ VMOVDQU 1376(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1408(CX), Y8
+ VMOVDQU 1440(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1472(CX), Y8
+ VMOVDQU 1504(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1536(CX), Y8
+ VMOVDQU 1568(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1600(CX), Y8
+ VMOVDQU 1632(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1664(CX), Y8
+ VMOVDQU 1696(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1728(CX), Y8
+ VMOVDQU 1760(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ VMOVDQU Y0, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y1, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y2, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y3, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y4, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y5, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y6, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_4x7_loop
+ VZEROUPPER
+
+mulAvxTwo_4x7_end:
+ RET
+
+// func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x7_64(SB), $0-88
+ // Loading 23 of 28 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 37 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R8
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R8
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, DX
+
+mulGFNI_4x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ VMOVDQU64 Z23, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x7_64_loop
+ VZEROUPPER
+
+mulGFNI_4x7_64_end:
+ RET
+
+// func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x7(SB), $0-88
+ // Loading 7 of 28 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 37 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R8
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R8
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, DX
+
+mulAvxGFNI_4x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x7_end:
+ RET
+
+// func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x7_64Xor(SB), $0-88
+ // Loading 23 of 28 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 37 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R8
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R8
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, DX
+
+mulGFNI_4x7_64Xor_loop:
+ // Load 7 outputs
+ VMOVDQU64 (R9), Z23
+ VMOVDQU64 (R10), Z24
+ VMOVDQU64 (R11), Z25
+ VMOVDQU64 (R12), Z26
+ VMOVDQU64 (R13), Z27
+ VMOVDQU64 (R14), Z28
+ VMOVDQU64 (R8), Z29
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ VMOVDQU64 Z23, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x7Xor(SB), $0-88
+ // Loading 7 of 28 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 37 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R8
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R8
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, DX
+
+mulAvxGFNI_4x7Xor_loop:
+ // Load 7 outputs
+ VMOVDQU (R9), Y7
+ VMOVDQU (R10), Y8
+ VMOVDQU (R11), Y9
+ VMOVDQU (R12), Y10
+ VMOVDQU (R13), Y11
+ VMOVDQU (R14), Y12
+ VMOVDQU (R8), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x7Xor_end:
+ RET
+
+// func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x7Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 68 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x7Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R8
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R8
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_4x7Xor_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU (R9), Y0
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU (R10), Y1
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU (R11), Y2
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU (R12), Y3
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU (R13), Y4
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU (R14), Y5
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU (R8), Y6
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 896(CX), Y8
+ VMOVDQU 928(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 960(CX), Y8
+ VMOVDQU 992(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1024(CX), Y8
+ VMOVDQU 1056(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1088(CX), Y8
+ VMOVDQU 1120(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1152(CX), Y8
+ VMOVDQU 1184(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1216(CX), Y8
+ VMOVDQU 1248(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1280(CX), Y8
+ VMOVDQU 1312(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1344(CX), Y8
+ VMOVDQU 1376(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1408(CX), Y8
+ VMOVDQU 1440(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1472(CX), Y8
+ VMOVDQU 1504(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1536(CX), Y8
+ VMOVDQU 1568(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1600(CX), Y8
+ VMOVDQU 1632(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1664(CX), Y8
+ VMOVDQU 1696(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1728(CX), Y8
+ VMOVDQU 1760(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ VMOVDQU Y0, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y1, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y2, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y3, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y4, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y5, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y6, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_4x7Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_4x7Xor_end:
+ RET
+
+// func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x8(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 77 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x8_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_4x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y0
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y2
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y4
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y5
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y6
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y7
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1024(CX), Y9
+ VMOVDQU 1056(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1088(CX), Y9
+ VMOVDQU 1120(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1152(CX), Y9
+ VMOVDQU 1184(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1216(CX), Y9
+ VMOVDQU 1248(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y9
+ VMOVDQU 1312(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1344(CX), Y9
+ VMOVDQU 1376(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1408(CX), Y9
+ VMOVDQU 1440(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1472(CX), Y9
+ VMOVDQU 1504(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1536(CX), Y9
+ VMOVDQU 1568(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1600(CX), Y9
+ VMOVDQU 1632(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1664(CX), Y9
+ VMOVDQU 1696(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1728(CX), Y9
+ VMOVDQU 1760(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1792(CX), Y9
+ VMOVDQU 1824(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1856(CX), Y9
+ VMOVDQU 1888(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1920(CX), Y9
+ VMOVDQU 1952(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1984(CX), Y9
+ VMOVDQU 2016(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ VMOVDQU Y0, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y1, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y2, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y3, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y4, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y5, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y6, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y7, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_4x8_loop
+ VZEROUPPER
+
+mulAvxTwo_4x8_end:
+ RET
+
+// func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x8_64(SB), $8-88
+ // Loading 22 of 32 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, DX
+
+mulGFNI_4x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x8_64_loop
+ VZEROUPPER
+
+mulGFNI_4x8_64_end:
+ RET
+
+// func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x8(SB), $8-88
+ // Loading 6 of 32 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, DX
+
+mulAvxGFNI_4x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x8_end:
+ RET
+
+// func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x8_64Xor(SB), $8-88
+ // Loading 22 of 32 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, DX
+
+mulGFNI_4x8_64Xor_loop:
+ // Load 8 outputs
+ VMOVDQU64 (R9), Z22
+ VMOVDQU64 (R10), Z23
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R8), Z29
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x8Xor(SB), $8-88
+ // Loading 6 of 32 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, DX
+
+mulAvxGFNI_4x8Xor_loop:
+ // Load 8 outputs
+ VMOVDQU (R9), Y6
+ VMOVDQU (R10), Y7
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R8), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x8Xor_end:
+ RET
+
+// func mulAvxTwo_4x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x8Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 77 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x8Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_4x8Xor_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU (R9), Y0
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU (R10), Y1
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU (R11), Y2
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU (R12), Y3
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU (R13), Y4
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU (R14), Y5
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU (R15), Y6
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU (R8), Y7
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1024(CX), Y9
+ VMOVDQU 1056(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1088(CX), Y9
+ VMOVDQU 1120(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1152(CX), Y9
+ VMOVDQU 1184(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1216(CX), Y9
+ VMOVDQU 1248(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y9
+ VMOVDQU 1312(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1344(CX), Y9
+ VMOVDQU 1376(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1408(CX), Y9
+ VMOVDQU 1440(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1472(CX), Y9
+ VMOVDQU 1504(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1536(CX), Y9
+ VMOVDQU 1568(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1600(CX), Y9
+ VMOVDQU 1632(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1664(CX), Y9
+ VMOVDQU 1696(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1728(CX), Y9
+ VMOVDQU 1760(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1792(CX), Y9
+ VMOVDQU 1824(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1856(CX), Y9
+ VMOVDQU 1888(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1920(CX), Y9
+ VMOVDQU 1952(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1984(CX), Y9
+ VMOVDQU 2016(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ VMOVDQU Y0, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y1, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y2, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y3, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y4, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y5, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y6, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y7, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_4x8Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_4x8Xor_end:
+ RET
+
+// func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x9(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 86 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x9_end
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), AX
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, AX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X9
+ VPBROADCASTB X9, Y9
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxTwo_4x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y0
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y1
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y2
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y3
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y4
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y5
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y6
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y7
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y8
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1152(CX), Y10
+ VMOVDQU 1184(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1216(CX), Y10
+ VMOVDQU 1248(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1280(CX), Y10
+ VMOVDQU 1312(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1344(CX), Y10
+ VMOVDQU 1376(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1408(CX), Y10
+ VMOVDQU 1440(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 1472(CX), Y10
+ VMOVDQU 1504(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 1536(CX), Y10
+ VMOVDQU 1568(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1600(CX), Y10
+ VMOVDQU 1632(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1664(CX), Y10
+ VMOVDQU 1696(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (AX), Y12
+ ADDQ $0x20, AX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1728(CX), Y10
+ VMOVDQU 1760(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1792(CX), Y10
+ VMOVDQU 1824(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1856(CX), Y10
+ VMOVDQU 1888(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1920(CX), Y10
+ VMOVDQU 1952(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1984(CX), Y10
+ VMOVDQU 2016(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2048(CX), Y10
+ VMOVDQU 2080(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2112(CX), Y10
+ VMOVDQU 2144(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2176(CX), Y10
+ VMOVDQU 2208(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2240(CX), Y10
+ VMOVDQU 2272(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ VMOVDQU Y0, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y1, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y2, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y3, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y4, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y5, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y6, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y7, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y8, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxTwo_4x9_loop
+ VZEROUPPER
+
+mulAvxTwo_4x9_end:
+ RET
+
+// func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x9_64(SB), $8-88
+ // Loading 21 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), AX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_4x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ VMOVDQU64 Z21, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_4x9_64_loop
+ VZEROUPPER
+
+mulGFNI_4x9_64_end:
+ RET
+
+// func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x9(SB), $8-88
+ // Loading 5 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), AX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_4x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_4x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x9_end:
+ RET
+
+// func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x9_64Xor(SB), $8-88
+ // Loading 21 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), AX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_4x9_64Xor_loop:
+ // Load 9 outputs
+ VMOVDQU64 (R8), Z21
+ VMOVDQU64 (R9), Z22
+ VMOVDQU64 (R10), Z23
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (DI), Z29
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ VMOVDQU64 Z21, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_4x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x9Xor(SB), $8-88
+ // Loading 5 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), AX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_4x9Xor_loop:
+ // Load 9 outputs
+ VMOVDQU (R8), Y5
+ VMOVDQU (R9), Y6
+ VMOVDQU (R10), Y7
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (DI), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_4x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x9Xor_end:
+ RET
+
+// func mulAvxTwo_4x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x9Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 86 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x9Xor_end
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), AX
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, AX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X9
+ VPBROADCASTB X9, Y9
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxTwo_4x9Xor_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU (R8), Y0
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU (R9), Y1
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU (R10), Y2
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU (R11), Y3
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU (R12), Y4
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU (R13), Y5
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU (R14), Y6
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU (R15), Y7
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU (DI), Y8
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1152(CX), Y10
+ VMOVDQU 1184(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1216(CX), Y10
+ VMOVDQU 1248(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1280(CX), Y10
+ VMOVDQU 1312(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1344(CX), Y10
+ VMOVDQU 1376(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1408(CX), Y10
+ VMOVDQU 1440(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 1472(CX), Y10
+ VMOVDQU 1504(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 1536(CX), Y10
+ VMOVDQU 1568(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1600(CX), Y10
+ VMOVDQU 1632(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1664(CX), Y10
+ VMOVDQU 1696(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (AX), Y12
+ ADDQ $0x20, AX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1728(CX), Y10
+ VMOVDQU 1760(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1792(CX), Y10
+ VMOVDQU 1824(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1856(CX), Y10
+ VMOVDQU 1888(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1920(CX), Y10
+ VMOVDQU 1952(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1984(CX), Y10
+ VMOVDQU 2016(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2048(CX), Y10
+ VMOVDQU 2080(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2112(CX), Y10
+ VMOVDQU 2144(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2176(CX), Y10
+ VMOVDQU 2208(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2240(CX), Y10
+ VMOVDQU 2272(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ VMOVDQU Y0, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y1, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y2, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y3, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y4, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y5, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y6, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y7, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y8, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxTwo_4x9Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_4x9Xor_end:
+ RET
+
+// func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x10(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 95 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x10_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to input
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, DX
+ MOVQ $0x0000000f, R10
+ MOVQ R10, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_4x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y0
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y1
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y2
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y3
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y4
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y5
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y6
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y7
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y8
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y9
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y13
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y13
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1280(CX), Y11
+ VMOVDQU 1312(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1344(CX), Y11
+ VMOVDQU 1376(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 1408(CX), Y11
+ VMOVDQU 1440(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 1472(CX), Y11
+ VMOVDQU 1504(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 1536(CX), Y11
+ VMOVDQU 1568(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 1600(CX), Y11
+ VMOVDQU 1632(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1664(CX), Y11
+ VMOVDQU 1696(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1728(CX), Y11
+ VMOVDQU 1760(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1792(CX), Y11
+ VMOVDQU 1824(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1856(CX), Y11
+ VMOVDQU 1888(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1920(CX), Y11
+ VMOVDQU 1952(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1984(CX), Y11
+ VMOVDQU 2016(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2048(CX), Y11
+ VMOVDQU 2080(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2112(CX), Y11
+ VMOVDQU 2144(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2176(CX), Y11
+ VMOVDQU 2208(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2240(CX), Y11
+ VMOVDQU 2272(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2304(CX), Y11
+ VMOVDQU 2336(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 2368(CX), Y11
+ VMOVDQU 2400(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 2432(CX), Y11
+ VMOVDQU 2464(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 2496(CX), Y11
+ VMOVDQU 2528(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ MOVQ (R8), R10
+ VMOVDQU Y0, (R10)(R9*1)
+ MOVQ 24(R8), R10
+ VMOVDQU Y1, (R10)(R9*1)
+ MOVQ 48(R8), R10
+ VMOVDQU Y2, (R10)(R9*1)
+ MOVQ 72(R8), R10
+ VMOVDQU Y3, (R10)(R9*1)
+ MOVQ 96(R8), R10
+ VMOVDQU Y4, (R10)(R9*1)
+ MOVQ 120(R8), R10
+ VMOVDQU Y5, (R10)(R9*1)
+ MOVQ 144(R8), R10
+ VMOVDQU Y6, (R10)(R9*1)
+ MOVQ 168(R8), R10
+ VMOVDQU Y7, (R10)(R9*1)
+ MOVQ 192(R8), R10
+ VMOVDQU Y8, (R10)(R9*1)
+ MOVQ 216(R8), R10
+ VMOVDQU Y9, (R10)(R9*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R9
+ DECQ AX
+ JNZ mulAvxTwo_4x10_loop
+ VZEROUPPER
+
+mulAvxTwo_4x10_end:
+ RET
+
+// func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x10_64(SB), $0-88
+ // Loading 20 of 40 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 52 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to input
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, DX
+
+mulGFNI_4x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R8), R10
+ VMOVDQU64 Z20, (R10)(R9*1)
+ MOVQ 24(R8), R10
+ VMOVDQU64 Z21, (R10)(R9*1)
+ MOVQ 48(R8), R10
+ VMOVDQU64 Z22, (R10)(R9*1)
+ MOVQ 72(R8), R10
+ VMOVDQU64 Z23, (R10)(R9*1)
+ MOVQ 96(R8), R10
+ VMOVDQU64 Z24, (R10)(R9*1)
+ MOVQ 120(R8), R10
+ VMOVDQU64 Z25, (R10)(R9*1)
+ MOVQ 144(R8), R10
+ VMOVDQU64 Z26, (R10)(R9*1)
+ MOVQ 168(R8), R10
+ VMOVDQU64 Z27, (R10)(R9*1)
+ MOVQ 192(R8), R10
+ VMOVDQU64 Z28, (R10)(R9*1)
+ MOVQ 216(R8), R10
+ VMOVDQU64 Z29, (R10)(R9*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R9
+ DECQ AX
+ JNZ mulGFNI_4x10_64_loop
+ VZEROUPPER
+
+mulGFNI_4x10_64_end:
+ RET
+
+// func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x10(SB), $0-88
+ // Loading 4 of 40 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 52 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to input
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, DX
+
+mulAvxGFNI_4x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R8), R10
+ VMOVDQU Y4, (R10)(R9*1)
+ MOVQ 24(R8), R10
+ VMOVDQU Y5, (R10)(R9*1)
+ MOVQ 48(R8), R10
+ VMOVDQU Y6, (R10)(R9*1)
+ MOVQ 72(R8), R10
+ VMOVDQU Y7, (R10)(R9*1)
+ MOVQ 96(R8), R10
+ VMOVDQU Y8, (R10)(R9*1)
+ MOVQ 120(R8), R10
+ VMOVDQU Y9, (R10)(R9*1)
+ MOVQ 144(R8), R10
+ VMOVDQU Y10, (R10)(R9*1)
+ MOVQ 168(R8), R10
+ VMOVDQU Y11, (R10)(R9*1)
+ MOVQ 192(R8), R10
+ VMOVDQU Y12, (R10)(R9*1)
+ MOVQ 216(R8), R10
+ VMOVDQU Y13, (R10)(R9*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R9
+ DECQ AX
+ JNZ mulAvxGFNI_4x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x10_end:
+ RET
+
+// func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x10_64Xor(SB), $0-88
+ // Loading 20 of 40 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 52 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to input
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, DX
+
+mulGFNI_4x10_64Xor_loop:
+ // Load 10 outputs
+ MOVQ (R8), R10
+ VMOVDQU64 (R10)(R9*1), Z20
+ MOVQ 24(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z21
+ MOVQ 48(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z22
+ MOVQ 72(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z23
+ MOVQ 96(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z24
+ MOVQ 120(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z25
+ MOVQ 144(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z26
+ MOVQ 168(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z27
+ MOVQ 192(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z28
+ MOVQ 216(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R8), R10
+ VMOVDQU64 Z20, (R10)(R9*1)
+ MOVQ 24(R8), R10
+ VMOVDQU64 Z21, (R10)(R9*1)
+ MOVQ 48(R8), R10
+ VMOVDQU64 Z22, (R10)(R9*1)
+ MOVQ 72(R8), R10
+ VMOVDQU64 Z23, (R10)(R9*1)
+ MOVQ 96(R8), R10
+ VMOVDQU64 Z24, (R10)(R9*1)
+ MOVQ 120(R8), R10
+ VMOVDQU64 Z25, (R10)(R9*1)
+ MOVQ 144(R8), R10
+ VMOVDQU64 Z26, (R10)(R9*1)
+ MOVQ 168(R8), R10
+ VMOVDQU64 Z27, (R10)(R9*1)
+ MOVQ 192(R8), R10
+ VMOVDQU64 Z28, (R10)(R9*1)
+ MOVQ 216(R8), R10
+ VMOVDQU64 Z29, (R10)(R9*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R9
+ DECQ AX
+ JNZ mulGFNI_4x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x10Xor(SB), $0-88
+ // Loading 4 of 40 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 52 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to input
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, DX
+
+mulAvxGFNI_4x10Xor_loop:
+ // Load 10 outputs
+ MOVQ (R8), R10
+ VMOVDQU (R10)(R9*1), Y4
+ MOVQ 24(R8), R10
+ VMOVDQU (R10)(R9*1), Y5
+ MOVQ 48(R8), R10
+ VMOVDQU (R10)(R9*1), Y6
+ MOVQ 72(R8), R10
+ VMOVDQU (R10)(R9*1), Y7
+ MOVQ 96(R8), R10
+ VMOVDQU (R10)(R9*1), Y8
+ MOVQ 120(R8), R10
+ VMOVDQU (R10)(R9*1), Y9
+ MOVQ 144(R8), R10
+ VMOVDQU (R10)(R9*1), Y10
+ MOVQ 168(R8), R10
+ VMOVDQU (R10)(R9*1), Y11
+ MOVQ 192(R8), R10
+ VMOVDQU (R10)(R9*1), Y12
+ MOVQ 216(R8), R10
+ VMOVDQU (R10)(R9*1), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R8), R10
+ VMOVDQU Y4, (R10)(R9*1)
+ MOVQ 24(R8), R10
+ VMOVDQU Y5, (R10)(R9*1)
+ MOVQ 48(R8), R10
+ VMOVDQU Y6, (R10)(R9*1)
+ MOVQ 72(R8), R10
+ VMOVDQU Y7, (R10)(R9*1)
+ MOVQ 96(R8), R10
+ VMOVDQU Y8, (R10)(R9*1)
+ MOVQ 120(R8), R10
+ VMOVDQU Y9, (R10)(R9*1)
+ MOVQ 144(R8), R10
+ VMOVDQU Y10, (R10)(R9*1)
+ MOVQ 168(R8), R10
+ VMOVDQU Y11, (R10)(R9*1)
+ MOVQ 192(R8), R10
+ VMOVDQU Y12, (R10)(R9*1)
+ MOVQ 216(R8), R10
+ VMOVDQU Y13, (R10)(R9*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R9
+ DECQ AX
+ JNZ mulAvxGFNI_4x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x10Xor_end:
+ RET
+
+// func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_4x10Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 95 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_4x10Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to input
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, DX
+ MOVQ $0x0000000f, R10
+ MOVQ R10, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_4x10Xor_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ MOVQ (R8), R10
+ VMOVDQU (R10)(R9*1), Y0
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ MOVQ 24(R8), R10
+ VMOVDQU (R10)(R9*1), Y1
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ MOVQ 48(R8), R10
+ VMOVDQU (R10)(R9*1), Y2
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ MOVQ 72(R8), R10
+ VMOVDQU (R10)(R9*1), Y3
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ MOVQ 96(R8), R10
+ VMOVDQU (R10)(R9*1), Y4
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ MOVQ 120(R8), R10
+ VMOVDQU (R10)(R9*1), Y5
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ MOVQ 144(R8), R10
+ VMOVDQU (R10)(R9*1), Y6
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ MOVQ 168(R8), R10
+ VMOVDQU (R10)(R9*1), Y7
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ MOVQ 192(R8), R10
+ VMOVDQU (R10)(R9*1), Y8
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ MOVQ 216(R8), R10
+ VMOVDQU (R10)(R9*1), Y9
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y13
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y13
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1280(CX), Y11
+ VMOVDQU 1312(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1344(CX), Y11
+ VMOVDQU 1376(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 1408(CX), Y11
+ VMOVDQU 1440(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 1472(CX), Y11
+ VMOVDQU 1504(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 1536(CX), Y11
+ VMOVDQU 1568(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 1600(CX), Y11
+ VMOVDQU 1632(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1664(CX), Y11
+ VMOVDQU 1696(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1728(CX), Y11
+ VMOVDQU 1760(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1792(CX), Y11
+ VMOVDQU 1824(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1856(CX), Y11
+ VMOVDQU 1888(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1920(CX), Y11
+ VMOVDQU 1952(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1984(CX), Y11
+ VMOVDQU 2016(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2048(CX), Y11
+ VMOVDQU 2080(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2112(CX), Y11
+ VMOVDQU 2144(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2176(CX), Y11
+ VMOVDQU 2208(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2240(CX), Y11
+ VMOVDQU 2272(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2304(CX), Y11
+ VMOVDQU 2336(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 2368(CX), Y11
+ VMOVDQU 2400(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 2432(CX), Y11
+ VMOVDQU 2464(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 2496(CX), Y11
+ VMOVDQU 2528(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ MOVQ (R8), R10
+ VMOVDQU Y0, (R10)(R9*1)
+ MOVQ 24(R8), R10
+ VMOVDQU Y1, (R10)(R9*1)
+ MOVQ 48(R8), R10
+ VMOVDQU Y2, (R10)(R9*1)
+ MOVQ 72(R8), R10
+ VMOVDQU Y3, (R10)(R9*1)
+ MOVQ 96(R8), R10
+ VMOVDQU Y4, (R10)(R9*1)
+ MOVQ 120(R8), R10
+ VMOVDQU Y5, (R10)(R9*1)
+ MOVQ 144(R8), R10
+ VMOVDQU Y6, (R10)(R9*1)
+ MOVQ 168(R8), R10
+ VMOVDQU Y7, (R10)(R9*1)
+ MOVQ 192(R8), R10
+ VMOVDQU Y8, (R10)(R9*1)
+ MOVQ 216(R8), R10
+ VMOVDQU Y9, (R10)(R9*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R9
+ DECQ AX
+ JNZ mulAvxTwo_4x10Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_4x10Xor_end:
+ RET
+
+// func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x1_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x1_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+ MOVQ $0x0000000f, R10
+ MOVQ R10, X2
+ VPBROADCASTB X2, Y2
+
+mulAvxTwo_5x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU 32(BX), Y5
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ VPXOR Y3, Y4, Y0
+ VPXOR Y5, Y6, Y1
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU 32(SI), Y5
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU 32(DI), Y5
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU (R8), Y6
+ VMOVDQU 32(R8), Y5
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU (DX), Y6
+ VMOVDQU 32(DX), Y5
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 256(CX), Y3
+ VMOVDQU 288(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Store 1 outputs
+ VMOVDQU Y0, (R9)
+ VMOVDQU Y1, 32(R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_5x1_64_loop
+ VZEROUPPER
+
+mulAvxTwo_5x1_64_end:
+ RET
+
+// func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, CX
+
+mulGFNI_5x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z6
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z6, Z5
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z6
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z6
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z6
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (CX), Z6
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z4, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Store 1 outputs
+ VMOVDQU64 Z5, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x1_64_loop
+ VZEROUPPER
+
+mulGFNI_5x1_64_end:
+ RET
+
+// func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, CX
+
+mulAvxGFNI_5x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y6
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y6, Y5
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y6
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y6
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y6
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (CX), Y6
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y4, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Store 1 outputs
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x1_end:
+ RET
+
+// func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, CX
+
+mulGFNI_5x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (R8), Z5
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z6
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z6
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z6
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z6
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (CX), Z6
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z4, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Store 1 outputs
+ VMOVDQU64 Z5, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, CX
+
+mulAvxGFNI_5x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R8), Y5
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y6
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y6
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y6
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y6
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (CX), Y6
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y4, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Store 1 outputs
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x1Xor_end:
+ RET
+
+// func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x1_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x1_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+ MOVQ $0x0000000f, R10
+ MOVQ R10, X2
+ VPBROADCASTB X2, Y2
+
+mulAvxTwo_5x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R9), Y0
+ VMOVDQU 32(R9), Y1
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU 32(BX), Y5
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU 32(SI), Y5
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU 32(DI), Y5
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU (R8), Y6
+ VMOVDQU 32(R8), Y5
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU (DX), Y6
+ VMOVDQU 32(DX), Y5
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 256(CX), Y3
+ VMOVDQU 288(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Store 1 outputs
+ VMOVDQU Y0, (R9)
+ VMOVDQU Y1, 32(R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_5x1_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_5x1_64Xor_end:
+ RET
+
+// func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x2_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 49 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x2_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R9
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+ ADDQ R11, R9
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R11
+ MOVQ R11, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_5x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VPXOR Y7, Y8, Y3
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y9
+ VMOVDQU 32(SI), Y11
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU 32(DI), Y11
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y9
+ VMOVDQU 32(R8), Y11
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (R10)
+ VMOVDQU Y1, 32(R10)
+ ADDQ $0x40, R10
+ VMOVDQU Y2, (R9)
+ VMOVDQU Y3, 32(R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_5x2_64_loop
+ VZEROUPPER
+
+mulAvxTwo_5x2_64_end:
+ RET
+
+// func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R8
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+ ADDQ R10, R8
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, CX
+
+mulGFNI_5x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z12
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z12, Z10
+ VGF2P8AFFINEQB $0x00, Z1, Z12, Z11
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z12
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z12
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z12
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (CX), Z12
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z8, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z9, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Store 2 outputs
+ VMOVDQU64 Z10, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z11, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x2_64_loop
+ VZEROUPPER
+
+mulGFNI_5x2_64_end:
+ RET
+
+// func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x2(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R8
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+ ADDQ R10, R8
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, CX
+
+mulAvxGFNI_5x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y12, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y12, Y11
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (DI), Y12
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (CX), Y12
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y8, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Store 2 outputs
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x2_end:
+ RET
+
+// func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R8
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+ ADDQ R10, R8
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, CX
+
+mulGFNI_5x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (R9), Z10
+ VMOVDQU64 (R8), Z11
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z12
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z1, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z12
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z12
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z12
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (CX), Z12
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z8, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z9, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Store 2 outputs
+ VMOVDQU64 Z10, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z11, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x2Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R8
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+ ADDQ R10, R8
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, CX
+
+mulAvxGFNI_5x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R9), Y10
+ VMOVDQU (R8), Y11
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (DI), Y12
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (CX), Y12
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y8, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Store 2 outputs
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x2Xor_end:
+ RET
+
+// func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x2_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 49 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x2_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R9
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+ ADDQ R11, R9
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R11
+ MOVQ R11, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_5x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R10), Y0
+ VMOVDQU 32(R10), Y1
+ VMOVDQU (R9), Y2
+ VMOVDQU 32(R9), Y3
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y9
+ VMOVDQU 32(SI), Y11
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU 32(DI), Y11
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y9
+ VMOVDQU 32(R8), Y11
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (R10)
+ VMOVDQU Y1, 32(R10)
+ ADDQ $0x40, R10
+ VMOVDQU Y2, (R9)
+ VMOVDQU Y3, 32(R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_5x2_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_5x2_64Xor_end:
+ RET
+
+// func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x3_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 70 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x3_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R9
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R9
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_5x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VPXOR Y9, Y10, Y5
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y11
+ VMOVDQU 32(SI), Y13
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y11
+ VMOVDQU 32(DI), Y13
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y11
+ VMOVDQU 32(R8), Y13
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (R10)
+ VMOVDQU Y1, 32(R10)
+ ADDQ $0x40, R10
+ VMOVDQU Y2, (R11)
+ VMOVDQU Y3, 32(R11)
+ ADDQ $0x40, R11
+ VMOVDQU Y4, (R9)
+ VMOVDQU Y5, 32(R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_5x3_64_loop
+ VZEROUPPER
+
+mulAvxTwo_5x3_64_end:
+ RET
+
+// func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R8
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, R8
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, CX
+
+mulGFNI_5x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z18
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z18, Z15
+ VGF2P8AFFINEQB $0x00, Z1, Z18, Z16
+ VGF2P8AFFINEQB $0x00, Z2, Z18, Z17
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z18
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z18
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z18
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (CX), Z18
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Store 3 outputs
+ VMOVDQU64 Z15, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z16, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z17, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x3_64_loop
+ VZEROUPPER
+
+mulGFNI_5x3_64_end:
+ RET
+
+// func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x3(SB), $0-88
+ // Loading 11 of 15 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R9
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R9
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, DX
+
+mulAvxGFNI_5x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y12, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x3_end:
+ RET
+
+// func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x3_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R8
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, R8
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, CX
+
+mulGFNI_5x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (R9), Z15
+ VMOVDQU64 (R10), Z16
+ VMOVDQU64 (R8), Z17
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z18
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z1, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z18
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z18
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z18
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (CX), Z18
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Store 3 outputs
+ VMOVDQU64 Z15, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z16, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z17, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x3Xor(SB), $0-88
+ // Loading 11 of 15 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R9
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R9
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, DX
+
+mulAvxGFNI_5x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R10), Y11
+ VMOVDQU (R11), Y12
+ VMOVDQU (R9), Y13
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y12, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x3Xor_end:
+ RET
+
+// func mulAvxTwo_5x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x3_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 70 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x3_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R9
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R9
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_5x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R10), Y0
+ VMOVDQU 32(R10), Y1
+ VMOVDQU (R11), Y2
+ VMOVDQU 32(R11), Y3
+ VMOVDQU (R9), Y4
+ VMOVDQU 32(R9), Y5
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y11
+ VMOVDQU 32(SI), Y13
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y11
+ VMOVDQU 32(DI), Y13
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y11
+ VMOVDQU 32(R8), Y13
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (R10)
+ VMOVDQU Y1, 32(R10)
+ ADDQ $0x40, R10
+ VMOVDQU Y2, (R11)
+ VMOVDQU Y3, 32(R11)
+ ADDQ $0x40, R11
+ VMOVDQU Y4, (R9)
+ VMOVDQU Y5, 32(R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_5x3_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_5x3_64Xor_end:
+ RET
+
+// func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x4(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 49 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x4_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R9
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R9
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_5x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y1
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y3
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y7
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1024(CX), Y5
+ VMOVDQU 1056(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1088(CX), Y5
+ VMOVDQU 1120(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1152(CX), Y5
+ VMOVDQU 1184(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1216(CX), Y5
+ VMOVDQU 1248(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ VMOVDQU Y0, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y1, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y2, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y3, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_5x4_loop
+ VZEROUPPER
+
+mulAvxTwo_5x4_end:
+ RET
+
+// func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x4_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R8
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R8
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, CX
+
+mulGFNI_5x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z24
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z24, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z24, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z24, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z24, Z23
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z24
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (SI), Z24
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (DI), Z24
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (CX), Z24
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Store 4 outputs
+ VMOVDQU64 Z20, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z21, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z22, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z23, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x4_64_loop
+ VZEROUPPER
+
+mulGFNI_5x4_64_end:
+ RET
+
+// func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x4(SB), $0-88
+ // Loading 10 of 20 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R9
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R9
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, DX
+
+mulAvxGFNI_5x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x4_end:
+ RET
+
+// func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x4_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R8
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R8
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, CX
+
+mulGFNI_5x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (R9), Z20
+ VMOVDQU64 (R10), Z21
+ VMOVDQU64 (R11), Z22
+ VMOVDQU64 (R8), Z23
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z24
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z24
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (SI), Z24
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (DI), Z24
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (CX), Z24
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Store 4 outputs
+ VMOVDQU64 Z20, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z21, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z22, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z23, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x4Xor(SB), $0-88
+ // Loading 10 of 20 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R9
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R9
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, DX
+
+mulAvxGFNI_5x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (R10), Y10
+ VMOVDQU (R11), Y11
+ VMOVDQU (R12), Y12
+ VMOVDQU (R9), Y13
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x4Xor_end:
+ RET
+
+// func mulAvxTwo_5x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x4Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 49 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x4Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R9
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R9
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_5x4Xor_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (R10), Y0
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU (R11), Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU (R12), Y2
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU (R9), Y3
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y7
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1024(CX), Y5
+ VMOVDQU 1056(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1088(CX), Y5
+ VMOVDQU 1120(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1152(CX), Y5
+ VMOVDQU 1184(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1216(CX), Y5
+ VMOVDQU 1248(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ VMOVDQU Y0, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y1, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y2, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y3, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_5x4Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_5x4Xor_end:
+ RET
+
+// func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x5(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 60 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x5_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R9
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R9
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_5x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y0
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y1
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y2
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y3
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y4
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 640(CX), Y6
+ VMOVDQU 672(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 704(CX), Y6
+ VMOVDQU 736(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 768(CX), Y6
+ VMOVDQU 800(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 832(CX), Y6
+ VMOVDQU 864(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 896(CX), Y6
+ VMOVDQU 928(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y8
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 960(CX), Y6
+ VMOVDQU 992(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1024(CX), Y6
+ VMOVDQU 1056(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1088(CX), Y6
+ VMOVDQU 1120(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1152(CX), Y6
+ VMOVDQU 1184(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1216(CX), Y6
+ VMOVDQU 1248(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1280(CX), Y6
+ VMOVDQU 1312(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1344(CX), Y6
+ VMOVDQU 1376(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1408(CX), Y6
+ VMOVDQU 1440(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1472(CX), Y6
+ VMOVDQU 1504(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1536(CX), Y6
+ VMOVDQU 1568(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ VMOVDQU Y0, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y1, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y2, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y3, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y4, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_5x5_loop
+ VZEROUPPER
+
+mulAvxTwo_5x5_end:
+ RET
+
+// func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x5_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R8
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R8
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, CX
+
+mulGFNI_5x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (CX), Z30
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ VMOVDQU64 Z25, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z26, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z27, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z28, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z29, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x5_64_loop
+ VZEROUPPER
+
+mulGFNI_5x5_64_end:
+ RET
+
+// func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x5(SB), $0-88
+ // Loading 9 of 25 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R9
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R9
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, DX
+
+mulAvxGFNI_5x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x5_end:
+ RET
+
+// func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x5_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R8
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R8
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, CX
+
+mulGFNI_5x5_64Xor_loop:
+ // Load 5 outputs
+ VMOVDQU64 (R9), Z25
+ VMOVDQU64 (R10), Z26
+ VMOVDQU64 (R11), Z27
+ VMOVDQU64 (R12), Z28
+ VMOVDQU64 (R8), Z29
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (CX), Z30
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ VMOVDQU64 Z25, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z26, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z27, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z28, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z29, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x5Xor(SB), $0-88
+ // Loading 9 of 25 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R9
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R9
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, DX
+
+mulAvxGFNI_5x5Xor_loop:
+ // Load 5 outputs
+ VMOVDQU (R10), Y9
+ VMOVDQU (R11), Y10
+ VMOVDQU (R12), Y11
+ VMOVDQU (R13), Y12
+ VMOVDQU (R9), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x5Xor_end:
+ RET
+
+// func mulAvxTwo_5x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x5Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 60 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x5Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R9
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R9
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_5x5Xor_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU (R10), Y0
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU (R11), Y1
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU (R12), Y2
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU (R13), Y3
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU (R9), Y4
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 640(CX), Y6
+ VMOVDQU 672(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 704(CX), Y6
+ VMOVDQU 736(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 768(CX), Y6
+ VMOVDQU 800(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 832(CX), Y6
+ VMOVDQU 864(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 896(CX), Y6
+ VMOVDQU 928(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y8
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 960(CX), Y6
+ VMOVDQU 992(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1024(CX), Y6
+ VMOVDQU 1056(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1088(CX), Y6
+ VMOVDQU 1120(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1152(CX), Y6
+ VMOVDQU 1184(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1216(CX), Y6
+ VMOVDQU 1248(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1280(CX), Y6
+ VMOVDQU 1312(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1344(CX), Y6
+ VMOVDQU 1376(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1408(CX), Y6
+ VMOVDQU 1440(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1472(CX), Y6
+ VMOVDQU 1504(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1536(CX), Y6
+ VMOVDQU 1568(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ VMOVDQU Y0, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y1, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y2, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y3, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y4, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_5x5Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_5x5Xor_end:
+ RET
+
+// func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x6(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 71 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x6_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R9
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R9
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_5x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y3
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y5
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y9
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1536(CX), Y7
+ VMOVDQU 1568(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1600(CX), Y7
+ VMOVDQU 1632(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1664(CX), Y7
+ VMOVDQU 1696(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1728(CX), Y7
+ VMOVDQU 1760(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1792(CX), Y7
+ VMOVDQU 1824(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1856(CX), Y7
+ VMOVDQU 1888(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ VMOVDQU Y0, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y1, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y2, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y3, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y4, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y5, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_5x6_loop
+ VZEROUPPER
+
+mulAvxTwo_5x6_end:
+ RET
+
+// func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x6_64(SB), $0-88
+ // Loading 24 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R9
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R9
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, DX
+
+mulGFNI_5x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x6_64_loop
+ VZEROUPPER
+
+mulGFNI_5x6_64_end:
+ RET
+
+// func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x6(SB), $0-88
+ // Loading 8 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R9
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R9
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, DX
+
+mulAvxGFNI_5x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x6_end:
+ RET
+
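+// Note: the *Xor variants differ from the plain kernels only in that each loop
+// iteration first loads the existing output vectors and XORs the new products
+// into them instead of overwriting, so partial results can be accumulated.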
+// func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x6_64Xor(SB), $0-88
+ // Loading 24 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R9
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R9
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, DX
+
+mulGFNI_5x6_64Xor_loop:
+ // Load 6 outputs
+ VMOVDQU64 (R10), Z24
+ VMOVDQU64 (R11), Z25
+ VMOVDQU64 (R12), Z26
+ VMOVDQU64 (R13), Z27
+ VMOVDQU64 (R14), Z28
+ VMOVDQU64 (R9), Z29
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x6Xor(SB), $0-88
+ // Loading 8 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R9
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R9
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, DX
+
+mulAvxGFNI_5x6Xor_loop:
+ // Load 6 outputs
+ VMOVDQU (R10), Y8
+ VMOVDQU (R11), Y9
+ VMOVDQU (R12), Y10
+ VMOVDQU (R13), Y11
+ VMOVDQU (R14), Y12
+ VMOVDQU (R9), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x6Xor_end:
+ RET
+
+// func mulAvxTwo_5x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x6Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 71 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x6Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R9
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R9
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X6
+ VPBROADCASTB X6, Y6
+
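+	// Note: Y6 holds the nibble mask 0x0f in every byte. Each 32-byte block of
+	// input is split into low and high nibbles (VPAND / VPSRLQ+VPAND), which are
+	// used as VPSHUFB indexes into two 16-entry lookup tables per output; XOR3WAY
+	// then folds both partial products into the accumulator.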
+mulAvxTwo_5x6Xor_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU (R10), Y0
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU (R11), Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU (R12), Y2
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU (R13), Y3
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU (R14), Y4
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU (R9), Y5
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y9
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1536(CX), Y7
+ VMOVDQU 1568(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1600(CX), Y7
+ VMOVDQU 1632(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1664(CX), Y7
+ VMOVDQU 1696(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1728(CX), Y7
+ VMOVDQU 1760(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1792(CX), Y7
+ VMOVDQU 1824(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1856(CX), Y7
+ VMOVDQU 1888(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ VMOVDQU Y0, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y1, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y2, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y3, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y4, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y5, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_5x6Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_5x6Xor_end:
+ RET
+
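+// Note: with seven or more destinations the kernels use R10-R15 plus R9 for the
+// output pointers, so the start offset is carried in BP; presumably this is why
+// these functions declare an 8-byte stack frame ($8-88) instead of $0-88.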
+// func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x7(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x7_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_5x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y0
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y1
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y2
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y3
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y4
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y5
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y6
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 896(CX), Y8
+ VMOVDQU 928(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 960(CX), Y8
+ VMOVDQU 992(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1024(CX), Y8
+ VMOVDQU 1056(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1088(CX), Y8
+ VMOVDQU 1120(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1152(CX), Y8
+ VMOVDQU 1184(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1216(CX), Y8
+ VMOVDQU 1248(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1280(CX), Y8
+ VMOVDQU 1312(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y10
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1344(CX), Y8
+ VMOVDQU 1376(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1408(CX), Y8
+ VMOVDQU 1440(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1472(CX), Y8
+ VMOVDQU 1504(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1536(CX), Y8
+ VMOVDQU 1568(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1600(CX), Y8
+ VMOVDQU 1632(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1664(CX), Y8
+ VMOVDQU 1696(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1728(CX), Y8
+ VMOVDQU 1760(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1792(CX), Y8
+ VMOVDQU 1824(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1856(CX), Y8
+ VMOVDQU 1888(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1920(CX), Y8
+ VMOVDQU 1952(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1984(CX), Y8
+ VMOVDQU 2016(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2048(CX), Y8
+ VMOVDQU 2080(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2112(CX), Y8
+ VMOVDQU 2144(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2176(CX), Y8
+ VMOVDQU 2208(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ VMOVDQU Y0, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y1, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y2, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y3, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y4, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y5, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_5x7_loop
+ VZEROUPPER
+
+mulAvxTwo_5x7_end:
+ RET
+
+// func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x7_64(SB), $8-88
+ // Loading 23 of 35 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 44 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, DX
+
+mulGFNI_5x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x7_64_loop
+ VZEROUPPER
+
+mulGFNI_5x7_64_end:
+ RET
+
+// func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x7(SB), $8-88
+ // Loading 7 of 35 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 44 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, DX
+
+mulAvxGFNI_5x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x7_end:
+ RET
+
+// func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x7_64Xor(SB), $8-88
+ // Loading 23 of 35 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 44 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, DX
+
+mulGFNI_5x7_64Xor_loop:
+ // Load 7 outputs
+ VMOVDQU64 (R10), Z23
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R9), Z29
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x7Xor(SB), $8-88
+ // Loading 7 of 35 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 44 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, DX
+
+mulAvxGFNI_5x7Xor_loop:
+ // Load 7 outputs
+ VMOVDQU (R10), Y7
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R9), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x7Xor_end:
+ RET
+
+// func mulAvxTwo_5x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x7Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x7Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_5x7Xor_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU (R10), Y0
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU (R11), Y1
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU (R12), Y2
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU (R13), Y3
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU (R14), Y4
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU (R15), Y5
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU (R9), Y6
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 896(CX), Y8
+ VMOVDQU 928(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 960(CX), Y8
+ VMOVDQU 992(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1024(CX), Y8
+ VMOVDQU 1056(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1088(CX), Y8
+ VMOVDQU 1120(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1152(CX), Y8
+ VMOVDQU 1184(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1216(CX), Y8
+ VMOVDQU 1248(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1280(CX), Y8
+ VMOVDQU 1312(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y10
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1344(CX), Y8
+ VMOVDQU 1376(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1408(CX), Y8
+ VMOVDQU 1440(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1472(CX), Y8
+ VMOVDQU 1504(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1536(CX), Y8
+ VMOVDQU 1568(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1600(CX), Y8
+ VMOVDQU 1632(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1664(CX), Y8
+ VMOVDQU 1696(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1728(CX), Y8
+ VMOVDQU 1760(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1792(CX), Y8
+ VMOVDQU 1824(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1856(CX), Y8
+ VMOVDQU 1888(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1920(CX), Y8
+ VMOVDQU 1952(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1984(CX), Y8
+ VMOVDQU 2016(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2048(CX), Y8
+ VMOVDQU 2080(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2112(CX), Y8
+ VMOVDQU 2144(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2176(CX), Y8
+ VMOVDQU 2208(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ VMOVDQU Y0, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y1, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y2, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y3, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y4, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y5, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_5x7Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_5x7Xor_end:
+ RET
+
+// func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x8(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 93 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x8_end
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), AX
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, AX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X8
+ VPBROADCASTB X8, Y8
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxTwo_5x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y0
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y2
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y4
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y5
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y6
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y7
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1024(CX), Y9
+ VMOVDQU 1056(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1088(CX), Y9
+ VMOVDQU 1120(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1152(CX), Y9
+ VMOVDQU 1184(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1216(CX), Y9
+ VMOVDQU 1248(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y9
+ VMOVDQU 1312(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1344(CX), Y9
+ VMOVDQU 1376(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1408(CX), Y9
+ VMOVDQU 1440(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1472(CX), Y9
+ VMOVDQU 1504(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1536(CX), Y9
+ VMOVDQU 1568(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1600(CX), Y9
+ VMOVDQU 1632(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1664(CX), Y9
+ VMOVDQU 1696(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1728(CX), Y9
+ VMOVDQU 1760(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1792(CX), Y9
+ VMOVDQU 1824(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1856(CX), Y9
+ VMOVDQU 1888(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1920(CX), Y9
+ VMOVDQU 1952(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1984(CX), Y9
+ VMOVDQU 2016(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (AX), Y11
+ ADDQ $0x20, AX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2048(CX), Y9
+ VMOVDQU 2080(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2112(CX), Y9
+ VMOVDQU 2144(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2176(CX), Y9
+ VMOVDQU 2208(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2240(CX), Y9
+ VMOVDQU 2272(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2304(CX), Y9
+ VMOVDQU 2336(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2368(CX), Y9
+ VMOVDQU 2400(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2432(CX), Y9
+ VMOVDQU 2464(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 2496(CX), Y9
+ VMOVDQU 2528(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ VMOVDQU Y0, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y1, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y2, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y3, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y4, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y5, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y6, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y7, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxTwo_5x8_loop
+ VZEROUPPER
+
+mulAvxTwo_5x8_end:
+ RET
+
+// func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x8_64(SB), $8-88
+ // Loading 22 of 40 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
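+ // Note: the GFNI variant treats each matrix coefficient as an 8x8 bit matrix
+ // stored in one uint64. The first 22 matrices are broadcast into ZMM registers
+ // with VBROADCASTF32X2; the remainder are applied straight from memory via the
+ // embedded-broadcast form (VGF2P8AFFINEQB.BCST). Each VGF2P8AFFINEQB multiplies
+ // 64 input bytes at once, and products are accumulated with VXORPD.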
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), AX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_5x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_5x8_64_loop
+ VZEROUPPER
+
+mulGFNI_5x8_64_end:
+ RET
+
+// func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x8(SB), $8-88
+ // Loading 6 of 40 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
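+ // Note: same affine-GFNI approach as above, but using 256-bit YMM registers
+ // (32 bytes per iteration) for CPUs that have GFNI and AVX but not AVX-512.
+ // Only 6 of the 40 coefficient matrices fit in registers; the rest are
+ // re-broadcast with VBROADCASTSD inside the loop.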
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), AX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_5x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_5x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x8_end:
+ RET
+
+// func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x8_64Xor(SB), $8-88
+ // Loading 22 of 40 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
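+ // Note: the Xor variants load the current contents of the output slices at the
+ // top of each iteration and XOR the new products into them instead of
+ // overwriting, for use when the outputs already hold partial results.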
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), AX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_5x8_64Xor_loop:
+ // Load 8 outputs
+ VMOVDQU64 (R9), Z22
+ VMOVDQU64 (R10), Z23
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R8), Z29
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_5x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x8Xor(SB), $8-88
+ // Loading 6 of 40 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), AX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_5x8Xor_loop:
+ // Load 8 outputs
+ VMOVDQU (R9), Y6
+ VMOVDQU (R10), Y7
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R8), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_5x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x8Xor_end:
+ RET
+
+// func mulAvxTwo_5x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x8Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 93 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x8Xor_end
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), AX
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, AX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X8
+ VPBROADCASTB X8, Y8
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxTwo_5x8Xor_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU (R9), Y0
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU (R10), Y1
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU (R11), Y2
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU (R12), Y3
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU (R13), Y4
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU (R14), Y5
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU (R15), Y6
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU (R8), Y7
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1024(CX), Y9
+ VMOVDQU 1056(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1088(CX), Y9
+ VMOVDQU 1120(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1152(CX), Y9
+ VMOVDQU 1184(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1216(CX), Y9
+ VMOVDQU 1248(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y9
+ VMOVDQU 1312(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1344(CX), Y9
+ VMOVDQU 1376(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1408(CX), Y9
+ VMOVDQU 1440(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1472(CX), Y9
+ VMOVDQU 1504(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1536(CX), Y9
+ VMOVDQU 1568(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1600(CX), Y9
+ VMOVDQU 1632(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1664(CX), Y9
+ VMOVDQU 1696(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1728(CX), Y9
+ VMOVDQU 1760(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1792(CX), Y9
+ VMOVDQU 1824(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1856(CX), Y9
+ VMOVDQU 1888(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1920(CX), Y9
+ VMOVDQU 1952(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1984(CX), Y9
+ VMOVDQU 2016(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (AX), Y11
+ ADDQ $0x20, AX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2048(CX), Y9
+ VMOVDQU 2080(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2112(CX), Y9
+ VMOVDQU 2144(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2176(CX), Y9
+ VMOVDQU 2208(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2240(CX), Y9
+ VMOVDQU 2272(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2304(CX), Y9
+ VMOVDQU 2336(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2368(CX), Y9
+ VMOVDQU 2400(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2432(CX), Y9
+ VMOVDQU 2464(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 2496(CX), Y9
+ VMOVDQU 2528(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ VMOVDQU Y0, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y1, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y2, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y3, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y4, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y5, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y6, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y7, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxTwo_5x8Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_5x8Xor_end:
+ RET
+
+// func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x9(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 104 YMM used
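+ // Note: with 9 outputs there are not enough general-purpose registers to cache
+ // every output pointer, so each store re-loads the pointer from the out slice
+ // header (via R9) and addresses it with the running offset in R10.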
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x9_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+ MOVQ $0x0000000f, R11
+ MOVQ R11, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_5x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y0
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y1
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y2
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y3
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y4
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y5
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y6
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y7
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y8
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y12
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1152(CX), Y10
+ VMOVDQU 1184(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1216(CX), Y10
+ VMOVDQU 1248(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1280(CX), Y10
+ VMOVDQU 1312(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1344(CX), Y10
+ VMOVDQU 1376(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1408(CX), Y10
+ VMOVDQU 1440(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 1472(CX), Y10
+ VMOVDQU 1504(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 1536(CX), Y10
+ VMOVDQU 1568(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1600(CX), Y10
+ VMOVDQU 1632(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1664(CX), Y10
+ VMOVDQU 1696(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y12
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1728(CX), Y10
+ VMOVDQU 1760(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1792(CX), Y10
+ VMOVDQU 1824(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1856(CX), Y10
+ VMOVDQU 1888(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1920(CX), Y10
+ VMOVDQU 1952(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1984(CX), Y10
+ VMOVDQU 2016(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2048(CX), Y10
+ VMOVDQU 2080(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2112(CX), Y10
+ VMOVDQU 2144(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2176(CX), Y10
+ VMOVDQU 2208(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2240(CX), Y10
+ VMOVDQU 2272(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2304(CX), Y10
+ VMOVDQU 2336(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2368(CX), Y10
+ VMOVDQU 2400(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 2432(CX), Y10
+ VMOVDQU 2464(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 2496(CX), Y10
+ VMOVDQU 2528(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 2560(CX), Y10
+ VMOVDQU 2592(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2624(CX), Y10
+ VMOVDQU 2656(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2688(CX), Y10
+ VMOVDQU 2720(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2752(CX), Y10
+ VMOVDQU 2784(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2816(CX), Y10
+ VMOVDQU 2848(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ MOVQ (R9), R11
+ VMOVDQU Y0, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU Y1, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU Y2, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU Y3, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU Y4, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU Y5, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU Y6, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU Y7, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU Y8, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R10
+ DECQ AX
+ JNZ mulAvxTwo_5x9_loop
+ VZEROUPPER
+
+mulAvxTwo_5x9_end:
+ RET
+
+// func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x9_64(SB), $0-88
+ // Loading 21 of 45 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 56 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+
+mulGFNI_5x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R9), R11
+ VMOVDQU64 Z21, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU64 Z22, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU64 Z23, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU64 Z24, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU64 Z25, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU64 Z26, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU64 Z27, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU64 Z28, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU64 Z29, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R10
+ DECQ AX
+ JNZ mulGFNI_5x9_64_loop
+ VZEROUPPER
+
+mulGFNI_5x9_64_end:
+ RET
+
+// func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x9(SB), $0-88
+ // Loading 5 of 45 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 56 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+
+mulAvxGFNI_5x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R9), R11
+ VMOVDQU Y5, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU Y6, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU Y7, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU Y8, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU Y9, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU Y10, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU Y11, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU Y12, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU Y13, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R10
+ DECQ AX
+ JNZ mulAvxGFNI_5x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x9_end:
+ RET
+
+// func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x9_64Xor(SB), $0-88
+ // Loading 21 of 45 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 56 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+
+mulGFNI_5x9_64Xor_loop:
+ // Load 9 outputs
+ MOVQ (R9), R11
+ VMOVDQU64 (R11)(R10*1), Z21
+ MOVQ 24(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z22
+ MOVQ 48(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z23
+ MOVQ 72(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z24
+ MOVQ 96(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z25
+ MOVQ 120(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z26
+ MOVQ 144(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z27
+ MOVQ 168(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z28
+ MOVQ 192(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z29
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R9), R11
+ VMOVDQU64 Z21, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU64 Z22, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU64 Z23, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU64 Z24, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU64 Z25, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU64 Z26, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU64 Z27, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU64 Z28, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU64 Z29, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R10
+ DECQ AX
+ JNZ mulGFNI_5x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x9Xor(SB), $0-88
+ // Loading 5 of 45 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 56 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+
+mulAvxGFNI_5x9Xor_loop:
+ // Load 9 outputs
+ MOVQ (R9), R11
+ VMOVDQU (R11)(R10*1), Y5
+ MOVQ 24(R9), R11
+ VMOVDQU (R11)(R10*1), Y6
+ MOVQ 48(R9), R11
+ VMOVDQU (R11)(R10*1), Y7
+ MOVQ 72(R9), R11
+ VMOVDQU (R11)(R10*1), Y8
+ MOVQ 96(R9), R11
+ VMOVDQU (R11)(R10*1), Y9
+ MOVQ 120(R9), R11
+ VMOVDQU (R11)(R10*1), Y10
+ MOVQ 144(R9), R11
+ VMOVDQU (R11)(R10*1), Y11
+ MOVQ 168(R9), R11
+ VMOVDQU (R11)(R10*1), Y12
+ MOVQ 192(R9), R11
+ VMOVDQU (R11)(R10*1), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R9), R11
+ VMOVDQU Y5, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU Y6, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU Y7, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU Y8, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU Y9, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU Y10, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU Y11, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU Y12, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU Y13, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R10
+ DECQ AX
+ JNZ mulAvxGFNI_5x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x9Xor_end:
+ RET
+
+// func mulAvxTwo_5x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x9Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 104 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x9Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+ MOVQ $0x0000000f, R11
+ MOVQ R11, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_5x9Xor_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ MOVQ (R9), R11
+ VMOVDQU (R11)(R10*1), Y0
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ MOVQ 24(R9), R11
+ VMOVDQU (R11)(R10*1), Y1
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ MOVQ 48(R9), R11
+ VMOVDQU (R11)(R10*1), Y2
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ MOVQ 72(R9), R11
+ VMOVDQU (R11)(R10*1), Y3
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ MOVQ 96(R9), R11
+ VMOVDQU (R11)(R10*1), Y4
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ MOVQ 120(R9), R11
+ VMOVDQU (R11)(R10*1), Y5
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ MOVQ 144(R9), R11
+ VMOVDQU (R11)(R10*1), Y6
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ MOVQ 168(R9), R11
+ VMOVDQU (R11)(R10*1), Y7
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ MOVQ 192(R9), R11
+ VMOVDQU (R11)(R10*1), Y8
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y12
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1152(CX), Y10
+ VMOVDQU 1184(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1216(CX), Y10
+ VMOVDQU 1248(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1280(CX), Y10
+ VMOVDQU 1312(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1344(CX), Y10
+ VMOVDQU 1376(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1408(CX), Y10
+ VMOVDQU 1440(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 1472(CX), Y10
+ VMOVDQU 1504(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 1536(CX), Y10
+ VMOVDQU 1568(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1600(CX), Y10
+ VMOVDQU 1632(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1664(CX), Y10
+ VMOVDQU 1696(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y12
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1728(CX), Y10
+ VMOVDQU 1760(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1792(CX), Y10
+ VMOVDQU 1824(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1856(CX), Y10
+ VMOVDQU 1888(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1920(CX), Y10
+ VMOVDQU 1952(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1984(CX), Y10
+ VMOVDQU 2016(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2048(CX), Y10
+ VMOVDQU 2080(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2112(CX), Y10
+ VMOVDQU 2144(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2176(CX), Y10
+ VMOVDQU 2208(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2240(CX), Y10
+ VMOVDQU 2272(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2304(CX), Y10
+ VMOVDQU 2336(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2368(CX), Y10
+ VMOVDQU 2400(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 2432(CX), Y10
+ VMOVDQU 2464(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 2496(CX), Y10
+ VMOVDQU 2528(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 2560(CX), Y10
+ VMOVDQU 2592(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2624(CX), Y10
+ VMOVDQU 2656(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2688(CX), Y10
+ VMOVDQU 2720(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2752(CX), Y10
+ VMOVDQU 2784(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2816(CX), Y10
+ VMOVDQU 2848(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ MOVQ (R9), R11
+ VMOVDQU Y0, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU Y1, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU Y2, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU Y3, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU Y4, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU Y5, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU Y6, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU Y7, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU Y8, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R10
+ DECQ AX
+ JNZ mulAvxTwo_5x9Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_5x9Xor_end:
+ RET
+
+// func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x10(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 115 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x10_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+ MOVQ $0x0000000f, R11
+ MOVQ R11, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_5x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y0
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y1
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y2
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y3
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y4
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y5
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y6
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y7
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y8
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y9
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y13
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y13
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1280(CX), Y11
+ VMOVDQU 1312(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1344(CX), Y11
+ VMOVDQU 1376(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 1408(CX), Y11
+ VMOVDQU 1440(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 1472(CX), Y11
+ VMOVDQU 1504(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 1536(CX), Y11
+ VMOVDQU 1568(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 1600(CX), Y11
+ VMOVDQU 1632(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1664(CX), Y11
+ VMOVDQU 1696(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1728(CX), Y11
+ VMOVDQU 1760(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1792(CX), Y11
+ VMOVDQU 1824(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1856(CX), Y11
+ VMOVDQU 1888(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y13
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1920(CX), Y11
+ VMOVDQU 1952(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1984(CX), Y11
+ VMOVDQU 2016(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2048(CX), Y11
+ VMOVDQU 2080(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2112(CX), Y11
+ VMOVDQU 2144(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2176(CX), Y11
+ VMOVDQU 2208(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2240(CX), Y11
+ VMOVDQU 2272(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2304(CX), Y11
+ VMOVDQU 2336(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 2368(CX), Y11
+ VMOVDQU 2400(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 2432(CX), Y11
+ VMOVDQU 2464(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 2496(CX), Y11
+ VMOVDQU 2528(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 2560(CX), Y11
+ VMOVDQU 2592(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 2624(CX), Y11
+ VMOVDQU 2656(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2688(CX), Y11
+ VMOVDQU 2720(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2752(CX), Y11
+ VMOVDQU 2784(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2816(CX), Y11
+ VMOVDQU 2848(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2880(CX), Y11
+ VMOVDQU 2912(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2944(CX), Y11
+ VMOVDQU 2976(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3008(CX), Y11
+ VMOVDQU 3040(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3072(CX), Y11
+ VMOVDQU 3104(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3136(CX), Y11
+ VMOVDQU 3168(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ MOVQ (R9), R11
+ VMOVDQU Y0, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU Y1, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU Y2, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU Y3, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU Y4, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU Y5, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU Y6, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU Y7, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU Y8, (R11)(R10*1)
+ MOVQ 216(R9), R11
+ VMOVDQU Y9, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R10
+ DECQ AX
+ JNZ mulAvxTwo_5x10_loop
+ VZEROUPPER
+
+mulAvxTwo_5x10_end:
+ RET
+
+// func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x10_64(SB), $0-88
+ // Loading 20 of 50 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 62 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+
+mulGFNI_5x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R9), R11
+ VMOVDQU64 Z20, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU64 Z21, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU64 Z22, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU64 Z23, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU64 Z24, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU64 Z25, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU64 Z26, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU64 Z27, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU64 Z28, (R11)(R10*1)
+ MOVQ 216(R9), R11
+ VMOVDQU64 Z29, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R10
+ DECQ AX
+ JNZ mulGFNI_5x10_64_loop
+ VZEROUPPER
+
+mulGFNI_5x10_64_end:
+ RET
+
+// func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x10(SB), $0-88
+ // Loading 4 of 50 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 62 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+
+mulAvxGFNI_5x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R9), R11
+ VMOVDQU Y4, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU Y5, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU Y6, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU Y7, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU Y8, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU Y9, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU Y10, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU Y11, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU Y12, (R11)(R10*1)
+ MOVQ 216(R9), R11
+ VMOVDQU Y13, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R10
+ DECQ AX
+ JNZ mulAvxGFNI_5x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x10_end:
+ RET
+
+// func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x10_64Xor(SB), $0-88
+ // Loading 20 of 50 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 62 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+
+mulGFNI_5x10_64Xor_loop:
+ // Load 10 outputs
+ MOVQ (R9), R11
+ VMOVDQU64 (R11)(R10*1), Z20
+ MOVQ 24(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z21
+ MOVQ 48(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z22
+ MOVQ 72(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z23
+ MOVQ 96(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z24
+ MOVQ 120(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z25
+ MOVQ 144(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z26
+ MOVQ 168(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z27
+ MOVQ 192(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z28
+ MOVQ 216(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R9), R11
+ VMOVDQU64 Z20, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU64 Z21, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU64 Z22, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU64 Z23, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU64 Z24, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU64 Z25, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU64 Z26, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU64 Z27, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU64 Z28, (R11)(R10*1)
+ MOVQ 216(R9), R11
+ VMOVDQU64 Z29, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R10
+ DECQ AX
+ JNZ mulGFNI_5x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x10Xor(SB), $0-88
+ // Loading 4 of 50 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 62 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+
+mulAvxGFNI_5x10Xor_loop:
+ // Load 10 outputs
+ MOVQ (R9), R11
+ VMOVDQU (R11)(R10*1), Y4
+ MOVQ 24(R9), R11
+ VMOVDQU (R11)(R10*1), Y5
+ MOVQ 48(R9), R11
+ VMOVDQU (R11)(R10*1), Y6
+ MOVQ 72(R9), R11
+ VMOVDQU (R11)(R10*1), Y7
+ MOVQ 96(R9), R11
+ VMOVDQU (R11)(R10*1), Y8
+ MOVQ 120(R9), R11
+ VMOVDQU (R11)(R10*1), Y9
+ MOVQ 144(R9), R11
+ VMOVDQU (R11)(R10*1), Y10
+ MOVQ 168(R9), R11
+ VMOVDQU (R11)(R10*1), Y11
+ MOVQ 192(R9), R11
+ VMOVDQU (R11)(R10*1), Y12
+ MOVQ 216(R9), R11
+ VMOVDQU (R11)(R10*1), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R9), R11
+ VMOVDQU Y4, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU Y5, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU Y6, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU Y7, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU Y8, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU Y9, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU Y10, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU Y11, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU Y12, (R11)(R10*1)
+ MOVQ 216(R9), R11
+ VMOVDQU Y13, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R10
+ DECQ AX
+ JNZ mulAvxGFNI_5x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x10Xor_end:
+ RET
+
+// func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_5x10Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 115 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_5x10Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+ MOVQ $0x0000000f, R11
+ MOVQ R11, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_5x10Xor_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ MOVQ (R9), R11
+ VMOVDQU (R11)(R10*1), Y0
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ MOVQ 24(R9), R11
+ VMOVDQU (R11)(R10*1), Y1
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ MOVQ 48(R9), R11
+ VMOVDQU (R11)(R10*1), Y2
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ MOVQ 72(R9), R11
+ VMOVDQU (R11)(R10*1), Y3
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ MOVQ 96(R9), R11
+ VMOVDQU (R11)(R10*1), Y4
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ MOVQ 120(R9), R11
+ VMOVDQU (R11)(R10*1), Y5
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ MOVQ 144(R9), R11
+ VMOVDQU (R11)(R10*1), Y6
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ MOVQ 168(R9), R11
+ VMOVDQU (R11)(R10*1), Y7
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ MOVQ 192(R9), R11
+ VMOVDQU (R11)(R10*1), Y8
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ MOVQ 216(R9), R11
+ VMOVDQU (R11)(R10*1), Y9
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y13
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y13
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1280(CX), Y11
+ VMOVDQU 1312(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1344(CX), Y11
+ VMOVDQU 1376(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 1408(CX), Y11
+ VMOVDQU 1440(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 1472(CX), Y11
+ VMOVDQU 1504(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 1536(CX), Y11
+ VMOVDQU 1568(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 1600(CX), Y11
+ VMOVDQU 1632(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1664(CX), Y11
+ VMOVDQU 1696(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1728(CX), Y11
+ VMOVDQU 1760(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1792(CX), Y11
+ VMOVDQU 1824(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1856(CX), Y11
+ VMOVDQU 1888(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y13
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1920(CX), Y11
+ VMOVDQU 1952(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1984(CX), Y11
+ VMOVDQU 2016(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2048(CX), Y11
+ VMOVDQU 2080(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2112(CX), Y11
+ VMOVDQU 2144(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2176(CX), Y11
+ VMOVDQU 2208(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2240(CX), Y11
+ VMOVDQU 2272(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2304(CX), Y11
+ VMOVDQU 2336(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 2368(CX), Y11
+ VMOVDQU 2400(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 2432(CX), Y11
+ VMOVDQU 2464(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 2496(CX), Y11
+ VMOVDQU 2528(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 2560(CX), Y11
+ VMOVDQU 2592(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 2624(CX), Y11
+ VMOVDQU 2656(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2688(CX), Y11
+ VMOVDQU 2720(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2752(CX), Y11
+ VMOVDQU 2784(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2816(CX), Y11
+ VMOVDQU 2848(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2880(CX), Y11
+ VMOVDQU 2912(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2944(CX), Y11
+ VMOVDQU 2976(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3008(CX), Y11
+ VMOVDQU 3040(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3072(CX), Y11
+ VMOVDQU 3104(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3136(CX), Y11
+ VMOVDQU 3168(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ MOVQ (R9), R11
+ VMOVDQU Y0, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU Y1, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU Y2, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU Y3, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU Y4, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU Y5, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU Y6, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU Y7, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU Y8, (R11)(R10*1)
+ MOVQ 216(R9), R11
+ VMOVDQU Y9, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R10
+ DECQ AX
+ JNZ mulAvxTwo_5x10Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_5x10Xor_end:
+ RET
+
+// func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x1_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x1_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R11
+ MOVQ R11, X2
+ VPBROADCASTB X2, Y2
+
+mulAvxTwo_6x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU 32(BX), Y5
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ VPXOR Y3, Y4, Y0
+ VPXOR Y5, Y6, Y1
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU 32(SI), Y5
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU 32(DI), Y5
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU (R8), Y6
+ VMOVDQU 32(R8), Y5
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU (R9), Y6
+ VMOVDQU 32(R9), Y5
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 256(CX), Y3
+ VMOVDQU 288(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU (DX), Y6
+ VMOVDQU 32(DX), Y5
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 320(CX), Y3
+ VMOVDQU 352(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Store 1 outputs
+ VMOVDQU Y0, (R10)
+ VMOVDQU Y1, 32(R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_6x1_64_loop
+ VZEROUPPER
+
+mulAvxTwo_6x1_64_end:
+ RET
+
+// func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 9 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, CX
+
+mulGFNI_6x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z7
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z7, Z6
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z7
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z7
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z7
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z7
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (CX), Z7
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z5, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Store 1 outputs
+ VMOVDQU64 Z6, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x1_64_loop
+ VZEROUPPER
+
+mulGFNI_6x1_64_end:
+ RET
+
+// func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 9 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, CX
+
+mulAvxGFNI_6x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y7, Y6
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y7
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (CX), Y7
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y5, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Store 1 outputs
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x1_end:
+ RET
+
+// func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 9 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, CX
+
+mulGFNI_6x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (R9), Z6
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z7
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z7
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z7
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z7
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z7
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (CX), Z7
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z5, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Store 1 outputs
+ VMOVDQU64 Z6, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 9 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, CX
+
+mulAvxGFNI_6x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R9), Y6
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y7
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (CX), Y7
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y5, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Store 1 outputs
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x1Xor_end:
+ RET
+
+// func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x1_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x1_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R11
+ MOVQ R11, X2
+ VPBROADCASTB X2, Y2
+
+mulAvxTwo_6x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R10), Y0
+ VMOVDQU 32(R10), Y1
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU 32(BX), Y5
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU 32(SI), Y5
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU 32(DI), Y5
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU (R8), Y6
+ VMOVDQU 32(R8), Y5
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU (R9), Y6
+ VMOVDQU 32(R9), Y5
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 256(CX), Y3
+ VMOVDQU 288(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU (DX), Y6
+ VMOVDQU 32(DX), Y5
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 320(CX), Y3
+ VMOVDQU 352(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Store 1 outputs
+ VMOVDQU Y0, (R10)
+ VMOVDQU Y1, 32(R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_6x1_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_6x1_64Xor_end:
+ RET
+
+// func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x2_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 57 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x2_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R10
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R11
+ ADDQ R12, R10
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_6x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VPXOR Y7, Y8, Y3
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y9
+ VMOVDQU 32(SI), Y11
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU 32(DI), Y11
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y9
+ VMOVDQU 32(R8), Y11
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y9
+ VMOVDQU 32(R9), Y11
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (R11)
+ VMOVDQU Y1, 32(R11)
+ ADDQ $0x40, R11
+ VMOVDQU Y2, (R10)
+ VMOVDQU Y3, 32(R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_6x2_64_loop
+ VZEROUPPER
+
+mulAvxTwo_6x2_64_end:
+ RET
+
+// func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 16 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R9
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+ ADDQ R11, R9
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, CX
+
+mulGFNI_6x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z14
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z14, Z12
+ VGF2P8AFFINEQB $0x00, Z1, Z14, Z13
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z14
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z3, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z14
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z5, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z14
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z7, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z14
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z9, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (CX), Z14
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z10, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z11, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Store 2 outputs
+ VMOVDQU64 Z12, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z13, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x2_64_loop
+ VZEROUPPER
+
+mulGFNI_6x2_64_end:
+ RET
+
+// func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x2(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 16 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R9
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+ ADDQ R11, R9
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, CX
+
+mulAvxGFNI_6x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (CX), Y14
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x2_end:
+ RET
+
+// func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 16 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R9
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+ ADDQ R11, R9
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, CX
+
+mulGFNI_6x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (R10), Z12
+ VMOVDQU64 (R9), Z13
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z14
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z1, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z14
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z3, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z14
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z5, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z14
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z7, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z14
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z9, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (CX), Z14
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z10, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z11, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Store 2 outputs
+ VMOVDQU64 Z12, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z13, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x2Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 16 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R9
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+ ADDQ R11, R9
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, CX
+
+mulAvxGFNI_6x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R10), Y12
+ VMOVDQU (R9), Y13
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (CX), Y14
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x2Xor_end:
+ RET
+
+// func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x2_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 57 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x2_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R10
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R11
+ ADDQ R12, R10
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_6x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R11), Y0
+ VMOVDQU 32(R11), Y1
+ VMOVDQU (R10), Y2
+ VMOVDQU 32(R10), Y3
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y9
+ VMOVDQU 32(SI), Y11
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU 32(DI), Y11
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y9
+ VMOVDQU 32(R8), Y11
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y9
+ VMOVDQU 32(R9), Y11
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (R11)
+ VMOVDQU Y1, 32(R11)
+ ADDQ $0x40, R11
+ VMOVDQU Y2, (R10)
+ VMOVDQU Y3, 32(R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_6x2_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_6x2_64Xor_end:
+ RET
+
+// func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x3_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x3_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R10
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R10
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_6x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VPXOR Y9, Y10, Y5
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y11
+ VMOVDQU 32(SI), Y13
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y11
+ VMOVDQU 32(DI), Y13
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y11
+ VMOVDQU 32(R8), Y13
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y11
+ VMOVDQU 32(R9), Y13
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (R11)
+ VMOVDQU Y1, 32(R11)
+ ADDQ $0x40, R11
+ VMOVDQU Y2, (R12)
+ VMOVDQU Y3, 32(R12)
+ ADDQ $0x40, R12
+ VMOVDQU Y4, (R10)
+ VMOVDQU Y5, 32(R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_6x3_64_loop
+ VZEROUPPER
+
+mulAvxTwo_6x3_64_end:
+ RET
+
+// func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R9
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R9
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, CX
+
+mulGFNI_6x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z21
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z1, Z21, Z19
+ VGF2P8AFFINEQB $0x00, Z2, Z21, Z20
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z21
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z4, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z5, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z21
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z7, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z8, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z21
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z10, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z11, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z21
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z13, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z14, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (CX), Z21
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z15, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z16, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z17, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Store 3 outputs
+ VMOVDQU64 Z18, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z19, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z20, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x3_64_loop
+ VZEROUPPER
+
+mulGFNI_6x3_64_end:
+ RET
+
+// func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x3(SB), $0-88
+ // Loading 11 of 18 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R10
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R10
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, DX
+
+mulAvxGFNI_6x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x3_end:
+ RET
+
+// func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x3_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R9
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R9
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, CX
+
+mulGFNI_6x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (R10), Z18
+ VMOVDQU64 (R11), Z19
+ VMOVDQU64 (R9), Z20
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z21
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z1, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z2, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z21
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z4, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z5, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z21
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z7, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z8, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z21
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z10, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z11, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z21
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z13, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z14, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (CX), Z21
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z15, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z16, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z17, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Store 3 outputs
+ VMOVDQU64 Z18, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z19, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z20, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x3Xor(SB), $0-88
+ // Loading 11 of 18 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R10
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R10
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, DX
+
+mulAvxGFNI_6x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R11), Y11
+ VMOVDQU (R12), Y12
+ VMOVDQU (R10), Y13
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x3Xor_end:
+ RET
+
+// func mulAvxTwo_6x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x3_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x3_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R10
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R10
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_6x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R11), Y0
+ VMOVDQU 32(R11), Y1
+ VMOVDQU (R12), Y2
+ VMOVDQU 32(R12), Y3
+ VMOVDQU (R10), Y4
+ VMOVDQU 32(R10), Y5
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y11
+ VMOVDQU 32(SI), Y13
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y11
+ VMOVDQU 32(DI), Y13
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y11
+ VMOVDQU 32(R8), Y13
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y11
+ VMOVDQU 32(R9), Y13
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (R11)
+ VMOVDQU Y1, 32(R11)
+ ADDQ $0x40, R11
+ VMOVDQU Y2, (R12)
+ VMOVDQU Y3, 32(R12)
+ ADDQ $0x40, R12
+ VMOVDQU Y4, (R10)
+ VMOVDQU Y5, 32(R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_6x3_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_6x3_64Xor_end:
+ RET
+
+// func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x4(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 57 YMM used
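+	// Note: 32 bytes per input per iteration. Input 0 initializes Y0-Y3 with a
+	// plain VPXOR of its two nibble-table lookups; inputs 1-5 accumulate via
+	// XOR3WAY. The table pair for input i and output j starts at 64*(i*4+j)(CX).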
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x4_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R10
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R10
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_6x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y1
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y3
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y7
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y7
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1024(CX), Y5
+ VMOVDQU 1056(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1088(CX), Y5
+ VMOVDQU 1120(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1152(CX), Y5
+ VMOVDQU 1184(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1216(CX), Y5
+ VMOVDQU 1248(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1280(CX), Y5
+ VMOVDQU 1312(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1344(CX), Y5
+ VMOVDQU 1376(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1408(CX), Y5
+ VMOVDQU 1440(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1472(CX), Y5
+ VMOVDQU 1504(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ VMOVDQU Y0, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y1, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y2, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y3, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_6x4_loop
+ VZEROUPPER
+
+mulAvxTwo_6x4_end:
+ RET
+
+// func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x4_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
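+	// Note: GFNI path. Each of the 24 matrix coefficients is an 8x8 bit matrix
+	// packed into 8 bytes; VBROADCASTF32X2 replicates it across a ZMM register
+	// (Z0-Z23). VGF2P8AFFINEQB with immediate 0 then multiplies every input
+	// byte by that constant in GF(2^8), VXORPD sums the partial products, and
+	// each iteration consumes 64 bytes per input.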
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R9
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R9
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, CX
+
+mulGFNI_6x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z28
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z28, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z28, Z27
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z28
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (SI), Z28
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (DI), Z28
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R8), Z28
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z16, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z17, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z18, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z19, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (CX), Z28
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z20, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z21, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z22, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z23, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Store 4 outputs
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x4_64_loop
+ VZEROUPPER
+
+mulGFNI_6x4_64_end:
+ RET
+
+// func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x4(SB), $0-88
+ // Loading 10 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
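+	// Note: AVX (256-bit) GFNI path, 32 bytes per input per iteration. Only the
+	// first 10 of the 24 coefficient matrices stay resident in Y0-Y9; the rest
+	// are re-broadcast from 80(CX) onward with VBROADCASTSD inside the loop,
+	// immediately before each use.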
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R10
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R10
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, DX
+
+mulAvxGFNI_6x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x4_end:
+ RET
+
+// func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x4_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
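+	// Note: Xor variant of mulGFNI_6x4_64: the four destinations are loaded
+	// into Z24-Z27 at the start of every iteration, so the affine products are
+	// accumulated into the existing output data instead of overwriting it.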
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R9
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R9
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, CX
+
+mulGFNI_6x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (R10), Z24
+ VMOVDQU64 (R11), Z25
+ VMOVDQU64 (R12), Z26
+ VMOVDQU64 (R9), Z27
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z28
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z28
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (SI), Z28
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (DI), Z28
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R8), Z28
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z16, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z17, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z18, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z19, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (CX), Z28
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z20, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z21, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z22, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z23, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Store 4 outputs
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x4Xor(SB), $0-88
+ // Loading 10 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R10
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R10
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, DX
+
+mulAvxGFNI_6x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (R11), Y10
+ VMOVDQU (R12), Y11
+ VMOVDQU (R13), Y12
+ VMOVDQU (R10), Y13
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x4Xor_end:
+ RET
+
+// func mulAvxTwo_6x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x4Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 57 YMM used
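+	// Note: Xor variant of mulAvxTwo_6x4: while processing input 0, each
+	// destination vector (Y0-Y3) is first loaded from its output slice, so all
+	// table lookups are folded in with XOR3WAY and the routine adds into,
+	// rather than replaces, the output.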
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x4Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R10
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R10
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_6x4Xor_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (R11), Y0
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU (R12), Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU (R13), Y2
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU (R10), Y3
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y7
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y7
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1024(CX), Y5
+ VMOVDQU 1056(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1088(CX), Y5
+ VMOVDQU 1120(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1152(CX), Y5
+ VMOVDQU 1184(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1216(CX), Y5
+ VMOVDQU 1248(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1280(CX), Y5
+ VMOVDQU 1312(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1344(CX), Y5
+ VMOVDQU 1376(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1408(CX), Y5
+ VMOVDQU 1440(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1472(CX), Y5
+ VMOVDQU 1504(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ VMOVDQU Y0, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y1, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y2, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y3, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_6x4Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_6x4Xor_end:
+ RET
+
+// func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x5(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 70 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x5_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R10
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R10
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_6x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y0
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y1
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y2
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y3
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y4
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 640(CX), Y6
+ VMOVDQU 672(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 704(CX), Y6
+ VMOVDQU 736(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 768(CX), Y6
+ VMOVDQU 800(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 832(CX), Y6
+ VMOVDQU 864(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 896(CX), Y6
+ VMOVDQU 928(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y8
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 960(CX), Y6
+ VMOVDQU 992(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1024(CX), Y6
+ VMOVDQU 1056(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1088(CX), Y6
+ VMOVDQU 1120(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1152(CX), Y6
+ VMOVDQU 1184(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1216(CX), Y6
+ VMOVDQU 1248(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y8
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1280(CX), Y6
+ VMOVDQU 1312(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1344(CX), Y6
+ VMOVDQU 1376(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1408(CX), Y6
+ VMOVDQU 1440(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1472(CX), Y6
+ VMOVDQU 1504(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1536(CX), Y6
+ VMOVDQU 1568(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1600(CX), Y6
+ VMOVDQU 1632(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1664(CX), Y6
+ VMOVDQU 1696(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1728(CX), Y6
+ VMOVDQU 1760(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1792(CX), Y6
+ VMOVDQU 1824(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1856(CX), Y6
+ VMOVDQU 1888(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ VMOVDQU Y0, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y1, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y2, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y3, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y4, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_6x5_loop
+ VZEROUPPER
+
+mulAvxTwo_6x5_end:
+ RET
+
+// func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x5_64(SB), $0-88
+ // Loading 25 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 37 YMM used
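+	// Note: 30 coefficient matrices are needed but only 25 fit in Z0-Z24; for
+	// input 5 the remaining five are consumed straight from memory using the
+	// embedded-broadcast form VGF2P8AFFINEQB.BCST on 200(CX)..232(CX).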
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R10
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R10
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, DX
+
+mulGFNI_6x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x5_64_loop
+ VZEROUPPER
+
+mulGFNI_6x5_64_end:
+ RET
+
+// func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x5(SB), $0-88
+ // Loading 9 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 37 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R10
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R10
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, DX
+
+mulAvxGFNI_6x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x5_end:
+ RET
+
+// func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x5_64Xor(SB), $0-88
+ // Loading 25 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 37 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R10
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R10
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, DX
+
+mulGFNI_6x5_64Xor_loop:
+ // Load 5 outputs
+ VMOVDQU64 (R11), Z25
+ VMOVDQU64 (R12), Z26
+ VMOVDQU64 (R13), Z27
+ VMOVDQU64 (R14), Z28
+ VMOVDQU64 (R10), Z29
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x5Xor(SB), $0-88
+ // Loading 9 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 37 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R10
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R10
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, DX
+
+mulAvxGFNI_6x5Xor_loop:
+ // Load 5 outputs
+ VMOVDQU (R11), Y9
+ VMOVDQU (R12), Y10
+ VMOVDQU (R13), Y11
+ VMOVDQU (R14), Y12
+ VMOVDQU (R10), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x5Xor_end:
+ RET
+
+// func mulAvxTwo_6x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x5Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 70 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x5Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R10
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R10
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_6x5Xor_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU (R11), Y0
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU (R12), Y1
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU (R13), Y2
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU (R14), Y3
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU (R10), Y4
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 640(CX), Y6
+ VMOVDQU 672(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 704(CX), Y6
+ VMOVDQU 736(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 768(CX), Y6
+ VMOVDQU 800(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 832(CX), Y6
+ VMOVDQU 864(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 896(CX), Y6
+ VMOVDQU 928(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y8
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 960(CX), Y6
+ VMOVDQU 992(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1024(CX), Y6
+ VMOVDQU 1056(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1088(CX), Y6
+ VMOVDQU 1120(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1152(CX), Y6
+ VMOVDQU 1184(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1216(CX), Y6
+ VMOVDQU 1248(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y8
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1280(CX), Y6
+ VMOVDQU 1312(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1344(CX), Y6
+ VMOVDQU 1376(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1408(CX), Y6
+ VMOVDQU 1440(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1472(CX), Y6
+ VMOVDQU 1504(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1536(CX), Y6
+ VMOVDQU 1568(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1600(CX), Y6
+ VMOVDQU 1632(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1664(CX), Y6
+ VMOVDQU 1696(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1728(CX), Y6
+ VMOVDQU 1760(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1792(CX), Y6
+ VMOVDQU 1824(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1856(CX), Y6
+ VMOVDQU 1888(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ VMOVDQU Y0, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y1, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y2, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y3, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y4, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_6x5Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_6x5Xor_end:
+ RET
+
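+// mulAvxTwo_6x6 and the other mulAvxTwo_* kernels use the split-nibble table
+// method for GF(2^8) multiplication: every matrix coefficient is stored as two
+// 32-byte VPSHUFB lookup tables (low and high nibble). Each 32-byte input
+// block is split with the broadcast 0x0f mask (VPSRLQ/VPAND), both tables are
+// applied with VPSHUFB, and the halves are combined. In this overwrite form
+// the products of input 0 initialize Y0-Y5 directly with VPXOR, and inputs 1-5
+// are folded in with XOR3WAY. Each (input, output) pair consumes 64 bytes of
+// tables, so with 6 outputs every input advances the table base by 384 bytes:
+// input 1 starts at 384(CX), input 5 at 1920(CX).
+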
+// func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x6(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 83 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x6_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_6x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y3
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y5
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y9
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y9
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1536(CX), Y7
+ VMOVDQU 1568(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1600(CX), Y7
+ VMOVDQU 1632(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1664(CX), Y7
+ VMOVDQU 1696(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1728(CX), Y7
+ VMOVDQU 1760(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1792(CX), Y7
+ VMOVDQU 1824(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1856(CX), Y7
+ VMOVDQU 1888(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1920(CX), Y7
+ VMOVDQU 1952(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1984(CX), Y7
+ VMOVDQU 2016(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2048(CX), Y7
+ VMOVDQU 2080(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2112(CX), Y7
+ VMOVDQU 2144(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2176(CX), Y7
+ VMOVDQU 2208(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2240(CX), Y7
+ VMOVDQU 2272(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ VMOVDQU Y0, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y1, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y2, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y3, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y4, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y5, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_6x6_loop
+ VZEROUPPER
+
+mulAvxTwo_6x6_end:
+ RET
+
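+// The mulGFNI_*_64 kernels process 64 bytes per output per iteration using
+// AVX-512 GFNI. Each matrix coefficient is encoded as an 8-byte 8x8 bit
+// matrix; VBROADCASTF32X2 replicates it across a ZMM register and
+// VGF2P8AFFINEQB multiplies every input byte by it over GF(2) (the $0x00
+// immediate is the affine constant). Partial products are combined with
+// VXORPD. The 6x6 case needs 36 matrices but only 24 fit in Z0-Z23 next to
+// the working registers Z24-Z31, so the remaining ones are applied straight
+// from memory with the broadcast form VGF2P8AFFINEQB.BCST, starting at
+// 192(CX) (table 24).
+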
+// func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x6_64(SB), $8-88
+ // Loading 24 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 44 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, DX
+
+mulGFNI_6x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x6_64_loop
+ VZEROUPPER
+
+mulGFNI_6x6_64_end:
+ RET
+
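+// mulAvxGFNI_* is the 256-bit GFNI variant for CPUs that expose GFNI with AVX
+// but not the AVX-512 forms: it handles 32 bytes per output per iteration and
+// has only 16 YMM registers, so just 8 of the 36 coefficient matrices are
+// preloaded into Y0-Y7; the rest are re-broadcast inside the loop with
+// VBROADCASTSD immediately before each VGF2P8AFFINEQB.
+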
+// func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x6(SB), $8-88
+ // Loading 8 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 44 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, DX
+
+mulAvxGFNI_6x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x6_end:
+ RET
+
+// func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x6_64Xor(SB), $8-88
+ // Loading 24 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 44 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, DX
+
+mulGFNI_6x6_64Xor_loop:
+ // Load 6 outputs
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R10), Z29
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x6Xor(SB), $8-88
+ // Loading 8 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 44 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, DX
+
+mulAvxGFNI_6x6Xor_loop:
+ // Load 6 outputs
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R10), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x6Xor_end:
+ RET
+
+// func mulAvxTwo_6x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x6Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 83 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x6Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_6x6Xor_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU (R11), Y0
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU (R12), Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU (R13), Y2
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU (R14), Y3
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU (R15), Y4
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU (R10), Y5
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y9
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y9
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1536(CX), Y7
+ VMOVDQU 1568(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1600(CX), Y7
+ VMOVDQU 1632(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1664(CX), Y7
+ VMOVDQU 1696(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1728(CX), Y7
+ VMOVDQU 1760(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1792(CX), Y7
+ VMOVDQU 1824(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1856(CX), Y7
+ VMOVDQU 1888(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1920(CX), Y7
+ VMOVDQU 1952(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1984(CX), Y7
+ VMOVDQU 2016(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2048(CX), Y7
+ VMOVDQU 2080(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2112(CX), Y7
+ VMOVDQU 2144(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2176(CX), Y7
+ VMOVDQU 2208(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2240(CX), Y7
+ VMOVDQU 2272(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ VMOVDQU Y0, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y1, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y2, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y3, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y4, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y5, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_6x6Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_6x6Xor_end:
+ RET
+
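+// From 6x7 upward the kernels need one general-purpose register per input and
+// per output, which exhausts AX-R15 and BP: AX doubles as the pointer to the
+// last input shard, so the 32-byte block count can no longer live in AX and is
+// recomputed into BP from n+80(FP) once the start offsets have been applied
+// (the "Reload length to save a register" pattern spelled out in the GFNI
+// variants below).
+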
+// func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x7(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 96 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x7_end
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), AX
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, AX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X7
+ VPBROADCASTB X7, Y7
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxTwo_6x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y0
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y1
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y2
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y3
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y4
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y5
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y6
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 896(CX), Y8
+ VMOVDQU 928(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 960(CX), Y8
+ VMOVDQU 992(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1024(CX), Y8
+ VMOVDQU 1056(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1088(CX), Y8
+ VMOVDQU 1120(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1152(CX), Y8
+ VMOVDQU 1184(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1216(CX), Y8
+ VMOVDQU 1248(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1280(CX), Y8
+ VMOVDQU 1312(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1344(CX), Y8
+ VMOVDQU 1376(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1408(CX), Y8
+ VMOVDQU 1440(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1472(CX), Y8
+ VMOVDQU 1504(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1536(CX), Y8
+ VMOVDQU 1568(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1600(CX), Y8
+ VMOVDQU 1632(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1664(CX), Y8
+ VMOVDQU 1696(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1728(CX), Y8
+ VMOVDQU 1760(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R8), Y10
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1792(CX), Y8
+ VMOVDQU 1824(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1856(CX), Y8
+ VMOVDQU 1888(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1920(CX), Y8
+ VMOVDQU 1952(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1984(CX), Y8
+ VMOVDQU 2016(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2048(CX), Y8
+ VMOVDQU 2080(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2112(CX), Y8
+ VMOVDQU 2144(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2176(CX), Y8
+ VMOVDQU 2208(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (AX), Y10
+ ADDQ $0x20, AX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2240(CX), Y8
+ VMOVDQU 2272(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2304(CX), Y8
+ VMOVDQU 2336(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2368(CX), Y8
+ VMOVDQU 2400(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2432(CX), Y8
+ VMOVDQU 2464(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2496(CX), Y8
+ VMOVDQU 2528(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2560(CX), Y8
+ VMOVDQU 2592(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2624(CX), Y8
+ VMOVDQU 2656(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ VMOVDQU Y0, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y1, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y2, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y3, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y4, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y5, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxTwo_6x7_loop
+ VZEROUPPER
+
+mulAvxTwo_6x7_end:
+ RET
+
+// func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x7_64(SB), $8-88
+ // Loading 23 of 42 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 51 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), AX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_6x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_6x7_64_loop
+ VZEROUPPER
+
+mulGFNI_6x7_64_end:
+ RET
+
+// func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x7(SB), $8-88
+ // Loading 7 of 42 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 51 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), AX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_6x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_6x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x7_end:
+ RET
+
+// func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x7_64Xor(SB), $8-88
+ // Loading 23 of 42 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 51 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), AX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_6x7_64Xor_loop:
+ // Load 7 outputs
+ VMOVDQU64 (R10), Z23
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R9), Z29
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_6x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x7Xor(SB), $8-88
+ // Loading 7 of 42 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 51 YMM used
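+	// 256-bit AVX/GFNI form of the same Xor kernel: 32 bytes per output per
+	// iteration. Only the seven tables for input 0 stay in Y0-Y6; the other
+	// 35 are re-broadcast with VBROADCASTSD on every pass.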
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), AX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_6x7Xor_loop:
+ // Load 7 outputs
+ VMOVDQU (R10), Y7
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R9), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_6x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x7Xor_end:
+ RET
+
+// func mulAvxTwo_6x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x7Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 96 YMM used
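+	// AVX2 fallback using the 4-bit split: each input block is divided into
+	// low/high nibbles (VPAND/VPSRLQ against the 0x0f mask in Y7), looked up
+	// with VPSHUFB in two 32-byte tables per output, and folded into the
+	// previously loaded output register via the XOR3WAY macro.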
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x7Xor_end
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), AX
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, AX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X7
+ VPBROADCASTB X7, Y7
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxTwo_6x7Xor_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU (R10), Y0
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU (R11), Y1
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU (R12), Y2
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU (R13), Y3
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU (R14), Y4
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU (R15), Y5
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU (R9), Y6
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 896(CX), Y8
+ VMOVDQU 928(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 960(CX), Y8
+ VMOVDQU 992(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1024(CX), Y8
+ VMOVDQU 1056(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1088(CX), Y8
+ VMOVDQU 1120(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1152(CX), Y8
+ VMOVDQU 1184(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1216(CX), Y8
+ VMOVDQU 1248(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1280(CX), Y8
+ VMOVDQU 1312(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1344(CX), Y8
+ VMOVDQU 1376(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1408(CX), Y8
+ VMOVDQU 1440(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1472(CX), Y8
+ VMOVDQU 1504(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1536(CX), Y8
+ VMOVDQU 1568(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1600(CX), Y8
+ VMOVDQU 1632(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1664(CX), Y8
+ VMOVDQU 1696(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1728(CX), Y8
+ VMOVDQU 1760(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R8), Y10
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1792(CX), Y8
+ VMOVDQU 1824(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1856(CX), Y8
+ VMOVDQU 1888(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1920(CX), Y8
+ VMOVDQU 1952(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1984(CX), Y8
+ VMOVDQU 2016(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2048(CX), Y8
+ VMOVDQU 2080(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2112(CX), Y8
+ VMOVDQU 2144(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2176(CX), Y8
+ VMOVDQU 2208(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (AX), Y10
+ ADDQ $0x20, AX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2240(CX), Y8
+ VMOVDQU 2272(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2304(CX), Y8
+ VMOVDQU 2336(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2368(CX), Y8
+ VMOVDQU 2400(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2432(CX), Y8
+ VMOVDQU 2464(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2496(CX), Y8
+ VMOVDQU 2528(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2560(CX), Y8
+ VMOVDQU 2592(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2624(CX), Y8
+ VMOVDQU 2656(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ VMOVDQU Y0, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y1, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y2, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y3, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y4, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y5, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxTwo_6x7Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_6x7Xor_end:
+ RET
+
+// func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x8(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 109 YMM used
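+	// With eight outputs the destinations are kept on the stack: the out
+	// pointers are reloaded from the slice at store time and indexed by the
+	// running offset in R11. Outputs are seeded from input 0 with VPXOR and
+	// accumulated with XOR3WAY for inputs 1-5.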
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x8_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_6x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y0
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y2
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y4
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y5
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y6
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y7
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1024(CX), Y9
+ VMOVDQU 1056(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1088(CX), Y9
+ VMOVDQU 1120(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1152(CX), Y9
+ VMOVDQU 1184(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1216(CX), Y9
+ VMOVDQU 1248(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y9
+ VMOVDQU 1312(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1344(CX), Y9
+ VMOVDQU 1376(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1408(CX), Y9
+ VMOVDQU 1440(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1472(CX), Y9
+ VMOVDQU 1504(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y11
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1536(CX), Y9
+ VMOVDQU 1568(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1600(CX), Y9
+ VMOVDQU 1632(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1664(CX), Y9
+ VMOVDQU 1696(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1728(CX), Y9
+ VMOVDQU 1760(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1792(CX), Y9
+ VMOVDQU 1824(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1856(CX), Y9
+ VMOVDQU 1888(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1920(CX), Y9
+ VMOVDQU 1952(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1984(CX), Y9
+ VMOVDQU 2016(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y11
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2048(CX), Y9
+ VMOVDQU 2080(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2112(CX), Y9
+ VMOVDQU 2144(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2176(CX), Y9
+ VMOVDQU 2208(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2240(CX), Y9
+ VMOVDQU 2272(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2304(CX), Y9
+ VMOVDQU 2336(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2368(CX), Y9
+ VMOVDQU 2400(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2432(CX), Y9
+ VMOVDQU 2464(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 2496(CX), Y9
+ VMOVDQU 2528(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2560(CX), Y9
+ VMOVDQU 2592(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2624(CX), Y9
+ VMOVDQU 2656(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2688(CX), Y9
+ VMOVDQU 2720(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2752(CX), Y9
+ VMOVDQU 2784(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2816(CX), Y9
+ VMOVDQU 2848(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2880(CX), Y9
+ VMOVDQU 2912(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2944(CX), Y9
+ VMOVDQU 2976(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3008(CX), Y9
+ VMOVDQU 3040(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y0, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y1, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y2, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y3, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y4, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y5, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxTwo_6x8_loop
+ VZEROUPPER
+
+mulAvxTwo_6x8_end:
+ RET
+
+// func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x8_64(SB), $0-88
+ // Loading 22 of 48 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 58 YMM used
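+	// 512-bit GFNI kernel for 6 inputs and 8 outputs. Input 0 writes the
+	// accumulators Z22-Z29 directly and inputs 1-5 XOR into them; the 26
+	// tables that do not fit in Z0-Z21 are broadcast with the .BCST form,
+	// and outputs are addressed through the out slice at offset R11.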
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulGFNI_6x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 Z22, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU64 Z23, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU64 Z24, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU64 Z25, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU64 Z26, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU64 Z27, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU64 Z28, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU64 Z29, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R11
+ DECQ AX
+ JNZ mulGFNI_6x8_64_loop
+ VZEROUPPER
+
+mulGFNI_6x8_64_end:
+ RET
+
+// func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x8(SB), $0-88
+ // Loading 6 of 48 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 58 YMM used
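+	// 256-bit AVX/GFNI version of the 6x8 kernel: 32-byte blocks, only six
+	// tables held in Y0-Y5, the rest broadcast with VBROADCASTSD each
+	// iteration, destinations addressed through the out slice at offset R11.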
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulAvxGFNI_6x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y8, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y9, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y10, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y11, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y12, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y13, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxGFNI_6x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x8_end:
+ RET
+
+// func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x8_64Xor(SB), $0-88
+ // Loading 22 of 48 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 58 YMM used
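+	// Xor variant of mulGFNI_6x8_64: the eight existing output blocks are
+	// loaded through the out slice at the top of the loop and all six inputs
+	// are XOR-accumulated into them before the store.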
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulGFNI_6x8_64Xor_loop:
+ // Load 8 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 (R12)(R11*1), Z22
+ MOVQ 24(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z23
+ MOVQ 48(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z24
+ MOVQ 72(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z25
+ MOVQ 96(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z26
+ MOVQ 120(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z27
+ MOVQ 144(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z28
+ MOVQ 168(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z29
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 Z22, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU64 Z23, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU64 Z24, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU64 Z25, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU64 Z26, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU64 Z27, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU64 Z28, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU64 Z29, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R11
+ DECQ AX
+ JNZ mulGFNI_6x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x8Xor(SB), $0-88
+ // Loading 6 of 48 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 58 YMM used
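+	// Xor variant of mulAvxGFNI_6x8: existing 32-byte output blocks are
+	// loaded first and every input's affine product is XORed into them.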
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulAvxGFNI_6x8Xor_loop:
+ // Load 8 outputs
+ MOVQ (R10), R12
+ VMOVDQU (R12)(R11*1), Y6
+ MOVQ 24(R10), R12
+ VMOVDQU (R12)(R11*1), Y7
+ MOVQ 48(R10), R12
+ VMOVDQU (R12)(R11*1), Y8
+ MOVQ 72(R10), R12
+ VMOVDQU (R12)(R11*1), Y9
+ MOVQ 96(R10), R12
+ VMOVDQU (R12)(R11*1), Y10
+ MOVQ 120(R10), R12
+ VMOVDQU (R12)(R11*1), Y11
+ MOVQ 144(R10), R12
+ VMOVDQU (R12)(R11*1), Y12
+ MOVQ 168(R10), R12
+ VMOVDQU (R12)(R11*1), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y8, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y9, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y10, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y11, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y12, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y13, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxGFNI_6x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x8Xor_end:
+ RET
+
+// func mulAvxTwo_6x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x8Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 109 YMM used
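+	// AVX2 nibble-lookup Xor variant: each output block is loaded through the
+	// out slice while input 0 is processed, then the VPSHUFB products of the
+	// remaining inputs are folded in with XOR3WAY before the block is stored.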
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x8Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_6x8Xor_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ MOVQ (R10), R12
+ VMOVDQU (R12)(R11*1), Y0
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ MOVQ 24(R10), R12
+ VMOVDQU (R12)(R11*1), Y1
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ MOVQ 48(R10), R12
+ VMOVDQU (R12)(R11*1), Y2
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ MOVQ 72(R10), R12
+ VMOVDQU (R12)(R11*1), Y3
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ MOVQ 96(R10), R12
+ VMOVDQU (R12)(R11*1), Y4
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ MOVQ 120(R10), R12
+ VMOVDQU (R12)(R11*1), Y5
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ MOVQ 144(R10), R12
+ VMOVDQU (R12)(R11*1), Y6
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ MOVQ 168(R10), R12
+ VMOVDQU (R12)(R11*1), Y7
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1024(CX), Y9
+ VMOVDQU 1056(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1088(CX), Y9
+ VMOVDQU 1120(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1152(CX), Y9
+ VMOVDQU 1184(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1216(CX), Y9
+ VMOVDQU 1248(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y9
+ VMOVDQU 1312(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1344(CX), Y9
+ VMOVDQU 1376(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1408(CX), Y9
+ VMOVDQU 1440(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1472(CX), Y9
+ VMOVDQU 1504(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y11
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1536(CX), Y9
+ VMOVDQU 1568(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1600(CX), Y9
+ VMOVDQU 1632(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1664(CX), Y9
+ VMOVDQU 1696(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1728(CX), Y9
+ VMOVDQU 1760(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1792(CX), Y9
+ VMOVDQU 1824(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1856(CX), Y9
+ VMOVDQU 1888(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1920(CX), Y9
+ VMOVDQU 1952(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1984(CX), Y9
+ VMOVDQU 2016(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y11
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2048(CX), Y9
+ VMOVDQU 2080(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2112(CX), Y9
+ VMOVDQU 2144(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2176(CX), Y9
+ VMOVDQU 2208(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2240(CX), Y9
+ VMOVDQU 2272(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2304(CX), Y9
+ VMOVDQU 2336(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2368(CX), Y9
+ VMOVDQU 2400(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2432(CX), Y9
+ VMOVDQU 2464(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 2496(CX), Y9
+ VMOVDQU 2528(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2560(CX), Y9
+ VMOVDQU 2592(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2624(CX), Y9
+ VMOVDQU 2656(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2688(CX), Y9
+ VMOVDQU 2720(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2752(CX), Y9
+ VMOVDQU 2784(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2816(CX), Y9
+ VMOVDQU 2848(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2880(CX), Y9
+ VMOVDQU 2912(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2944(CX), Y9
+ VMOVDQU 2976(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3008(CX), Y9
+ VMOVDQU 3040(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y0, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y1, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y2, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y3, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y4, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y5, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxTwo_6x8Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_6x8Xor_end:
+ RET
+
+// func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x9(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 122 YMM used
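+	// Note: GF(2^8) multiply-accumulate via the split-nibble table method.
+	// Each 32-byte input block is split into low and high nibbles
+	// (VPSRLQ/VPAND against a broadcast 0x0f mask); each nibble is used as a
+	// VPSHUFB index into a 32-byte table load, and the two partial products
+	// are combined with VPXOR for input 0 or accumulated with the XOR3WAY
+	// macro defined earlier in this file for later inputs. The matrix at CX
+	// holds two 32-byte tables per input/output pair (6*9 pairs = 3456
+	// bytes). Output slice headers are 24 bytes apart, hence the 0, 24,
+	// 48, ... offsets from R10 when storing.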
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x9_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_6x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y0
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y1
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y2
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y3
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y4
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y5
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y6
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y7
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y8
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y12
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1152(CX), Y10
+ VMOVDQU 1184(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1216(CX), Y10
+ VMOVDQU 1248(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1280(CX), Y10
+ VMOVDQU 1312(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1344(CX), Y10
+ VMOVDQU 1376(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1408(CX), Y10
+ VMOVDQU 1440(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 1472(CX), Y10
+ VMOVDQU 1504(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 1536(CX), Y10
+ VMOVDQU 1568(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1600(CX), Y10
+ VMOVDQU 1632(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1664(CX), Y10
+ VMOVDQU 1696(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y12
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1728(CX), Y10
+ VMOVDQU 1760(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1792(CX), Y10
+ VMOVDQU 1824(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1856(CX), Y10
+ VMOVDQU 1888(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1920(CX), Y10
+ VMOVDQU 1952(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1984(CX), Y10
+ VMOVDQU 2016(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2048(CX), Y10
+ VMOVDQU 2080(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2112(CX), Y10
+ VMOVDQU 2144(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2176(CX), Y10
+ VMOVDQU 2208(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2240(CX), Y10
+ VMOVDQU 2272(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y12
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2304(CX), Y10
+ VMOVDQU 2336(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2368(CX), Y10
+ VMOVDQU 2400(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 2432(CX), Y10
+ VMOVDQU 2464(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 2496(CX), Y10
+ VMOVDQU 2528(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 2560(CX), Y10
+ VMOVDQU 2592(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2624(CX), Y10
+ VMOVDQU 2656(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2688(CX), Y10
+ VMOVDQU 2720(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2752(CX), Y10
+ VMOVDQU 2784(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2816(CX), Y10
+ VMOVDQU 2848(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2880(CX), Y10
+ VMOVDQU 2912(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2944(CX), Y10
+ VMOVDQU 2976(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3008(CX), Y10
+ VMOVDQU 3040(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3072(CX), Y10
+ VMOVDQU 3104(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3136(CX), Y10
+ VMOVDQU 3168(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3200(CX), Y10
+ VMOVDQU 3232(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3264(CX), Y10
+ VMOVDQU 3296(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3328(CX), Y10
+ VMOVDQU 3360(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3392(CX), Y10
+ VMOVDQU 3424(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y0, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y1, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y2, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y3, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y4, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y5, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU Y8, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxTwo_6x9_loop
+ VZEROUPPER
+
+mulAvxTwo_6x9_end:
+ RET
+
+// func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x9_64(SB), $0-88
+ // Loading 21 of 54 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 65 YMM used
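+	// Note: GFNI variant. Each matrix coefficient is precomputed as a 64-bit
+	// 8x8 bit-matrix; a single VGF2P8AFFINEQB then multiplies 64 input bytes
+	// by that coefficient in GF(2^8). The first 21 of the 54 bit-matrices are
+	// broadcast into Z0-Z20 up front; the rest are applied straight from
+	// memory through the .BCST embedded-broadcast form. Products are folded
+	// into the nine accumulators with VXORPD.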
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulGFNI_6x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 Z21, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU64 Z22, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU64 Z23, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU64 Z24, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU64 Z25, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU64 Z26, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU64 Z27, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU64 Z28, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU64 Z29, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R11
+ DECQ AX
+ JNZ mulGFNI_6x9_64_loop
+ VZEROUPPER
+
+mulGFNI_6x9_64_end:
+ RET
+
+// func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x9(SB), $0-88
+ // Loading 5 of 54 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 65 YMM used
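+	// Note: 256-bit GFNI counterpart of mulGFNI_6x9_64: 32 bytes per shard
+	// per iteration, with only 5 of the 54 bit-matrices kept resident in
+	// Y0-Y4; the remaining coefficients are re-broadcast from the matrix
+	// with VBROADCASTSD as they are needed.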
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulAvxGFNI_6x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y5, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y8, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y9, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y10, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y11, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y12, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU Y13, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxGFNI_6x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x9_end:
+ RET
+
+// func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x9_64Xor(SB), $0-88
+ // Loading 21 of 54 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 65 YMM used
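+	// Note: Xor variant of mulGFNI_6x9_64. Instead of overwriting the output
+	// shards, the current 64-byte block of every output is loaded at the top
+	// of the loop and the new products are XORed into it, so the routine
+	// accumulates onto existing output data.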
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulGFNI_6x9_64Xor_loop:
+ // Load 9 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 (R12)(R11*1), Z21
+ MOVQ 24(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z22
+ MOVQ 48(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z23
+ MOVQ 72(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z24
+ MOVQ 96(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z25
+ MOVQ 120(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z26
+ MOVQ 144(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z27
+ MOVQ 168(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z28
+ MOVQ 192(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z29
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 Z21, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU64 Z22, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU64 Z23, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU64 Z24, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU64 Z25, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU64 Z26, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU64 Z27, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU64 Z28, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU64 Z29, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R11
+ DECQ AX
+ JNZ mulGFNI_6x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x9Xor(SB), $0-88
+ // Loading 5 of 54 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 65 YMM used
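+	// Note: 256-bit counterpart of mulGFNI_6x9_64Xor: the existing output
+	// blocks are loaded into Y5-Y13 first, then each input's GFNI products
+	// are XORed on top before the blocks are stored back.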
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulAvxGFNI_6x9Xor_loop:
+ // Load 9 outputs
+ MOVQ (R10), R12
+ VMOVDQU (R12)(R11*1), Y5
+ MOVQ 24(R10), R12
+ VMOVDQU (R12)(R11*1), Y6
+ MOVQ 48(R10), R12
+ VMOVDQU (R12)(R11*1), Y7
+ MOVQ 72(R10), R12
+ VMOVDQU (R12)(R11*1), Y8
+ MOVQ 96(R10), R12
+ VMOVDQU (R12)(R11*1), Y9
+ MOVQ 120(R10), R12
+ VMOVDQU (R12)(R11*1), Y10
+ MOVQ 144(R10), R12
+ VMOVDQU (R12)(R11*1), Y11
+ MOVQ 168(R10), R12
+ VMOVDQU (R12)(R11*1), Y12
+ MOVQ 192(R10), R12
+ VMOVDQU (R12)(R11*1), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y5, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y8, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y9, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y10, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y11, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y12, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU Y13, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxGFNI_6x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x9Xor_end:
+ RET
+
+// func mulAvxTwo_6x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x9Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 122 YMM used
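+	// Note: Xor variant of mulAvxTwo_6x9: while processing input 0, each
+	// output block is first loaded from memory (the MOVQ/VMOVDQU pairs below)
+	// and the table lookups are accumulated into it with XOR3WAY, rather than
+	// initializing the accumulators with VPXOR.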
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x9Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_6x9Xor_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ MOVQ (R10), R12
+ VMOVDQU (R12)(R11*1), Y0
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ MOVQ 24(R10), R12
+ VMOVDQU (R12)(R11*1), Y1
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ MOVQ 48(R10), R12
+ VMOVDQU (R12)(R11*1), Y2
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ MOVQ 72(R10), R12
+ VMOVDQU (R12)(R11*1), Y3
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ MOVQ 96(R10), R12
+ VMOVDQU (R12)(R11*1), Y4
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ MOVQ 120(R10), R12
+ VMOVDQU (R12)(R11*1), Y5
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ MOVQ 144(R10), R12
+ VMOVDQU (R12)(R11*1), Y6
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ MOVQ 168(R10), R12
+ VMOVDQU (R12)(R11*1), Y7
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ MOVQ 192(R10), R12
+ VMOVDQU (R12)(R11*1), Y8
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y12
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1152(CX), Y10
+ VMOVDQU 1184(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1216(CX), Y10
+ VMOVDQU 1248(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1280(CX), Y10
+ VMOVDQU 1312(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1344(CX), Y10
+ VMOVDQU 1376(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1408(CX), Y10
+ VMOVDQU 1440(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 1472(CX), Y10
+ VMOVDQU 1504(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 1536(CX), Y10
+ VMOVDQU 1568(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1600(CX), Y10
+ VMOVDQU 1632(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1664(CX), Y10
+ VMOVDQU 1696(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y12
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1728(CX), Y10
+ VMOVDQU 1760(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1792(CX), Y10
+ VMOVDQU 1824(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1856(CX), Y10
+ VMOVDQU 1888(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1920(CX), Y10
+ VMOVDQU 1952(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1984(CX), Y10
+ VMOVDQU 2016(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2048(CX), Y10
+ VMOVDQU 2080(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2112(CX), Y10
+ VMOVDQU 2144(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2176(CX), Y10
+ VMOVDQU 2208(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2240(CX), Y10
+ VMOVDQU 2272(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y12
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2304(CX), Y10
+ VMOVDQU 2336(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2368(CX), Y10
+ VMOVDQU 2400(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 2432(CX), Y10
+ VMOVDQU 2464(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 2496(CX), Y10
+ VMOVDQU 2528(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 2560(CX), Y10
+ VMOVDQU 2592(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2624(CX), Y10
+ VMOVDQU 2656(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2688(CX), Y10
+ VMOVDQU 2720(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2752(CX), Y10
+ VMOVDQU 2784(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2816(CX), Y10
+ VMOVDQU 2848(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2880(CX), Y10
+ VMOVDQU 2912(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2944(CX), Y10
+ VMOVDQU 2976(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3008(CX), Y10
+ VMOVDQU 3040(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3072(CX), Y10
+ VMOVDQU 3104(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3136(CX), Y10
+ VMOVDQU 3168(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3200(CX), Y10
+ VMOVDQU 3232(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3264(CX), Y10
+ VMOVDQU 3296(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3328(CX), Y10
+ VMOVDQU 3360(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3392(CX), Y10
+ VMOVDQU 3424(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y0, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y1, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y2, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y3, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y4, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y5, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU Y8, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxTwo_6x9Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_6x9Xor_end:
+ RET
+
+// func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x10(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 135 YMM used
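+	// Note: same split-nibble VPSHUFB scheme as mulAvxTwo_6x9, extended to
+	// 10 outputs; the matrix at CX grows to 6*10 table pairs (3840 bytes).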
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x10_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_6x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y0
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y1
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y2
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y3
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y4
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y5
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y6
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y7
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y8
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y9
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y13
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y13
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1280(CX), Y11
+ VMOVDQU 1312(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1344(CX), Y11
+ VMOVDQU 1376(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 1408(CX), Y11
+ VMOVDQU 1440(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 1472(CX), Y11
+ VMOVDQU 1504(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 1536(CX), Y11
+ VMOVDQU 1568(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 1600(CX), Y11
+ VMOVDQU 1632(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1664(CX), Y11
+ VMOVDQU 1696(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1728(CX), Y11
+ VMOVDQU 1760(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1792(CX), Y11
+ VMOVDQU 1824(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1856(CX), Y11
+ VMOVDQU 1888(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y13
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1920(CX), Y11
+ VMOVDQU 1952(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1984(CX), Y11
+ VMOVDQU 2016(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2048(CX), Y11
+ VMOVDQU 2080(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2112(CX), Y11
+ VMOVDQU 2144(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2176(CX), Y11
+ VMOVDQU 2208(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2240(CX), Y11
+ VMOVDQU 2272(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2304(CX), Y11
+ VMOVDQU 2336(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 2368(CX), Y11
+ VMOVDQU 2400(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 2432(CX), Y11
+ VMOVDQU 2464(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 2496(CX), Y11
+ VMOVDQU 2528(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y13
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 2560(CX), Y11
+ VMOVDQU 2592(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 2624(CX), Y11
+ VMOVDQU 2656(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2688(CX), Y11
+ VMOVDQU 2720(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2752(CX), Y11
+ VMOVDQU 2784(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2816(CX), Y11
+ VMOVDQU 2848(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2880(CX), Y11
+ VMOVDQU 2912(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2944(CX), Y11
+ VMOVDQU 2976(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3008(CX), Y11
+ VMOVDQU 3040(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3072(CX), Y11
+ VMOVDQU 3104(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3136(CX), Y11
+ VMOVDQU 3168(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3200(CX), Y11
+ VMOVDQU 3232(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3264(CX), Y11
+ VMOVDQU 3296(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3328(CX), Y11
+ VMOVDQU 3360(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 3392(CX), Y11
+ VMOVDQU 3424(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 3456(CX), Y11
+ VMOVDQU 3488(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 3520(CX), Y11
+ VMOVDQU 3552(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 3584(CX), Y11
+ VMOVDQU 3616(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3648(CX), Y11
+ VMOVDQU 3680(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3712(CX), Y11
+ VMOVDQU 3744(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3776(CX), Y11
+ VMOVDQU 3808(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y0, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y1, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y2, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y3, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y4, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y5, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU Y8, (R12)(R11*1)
+ MOVQ 216(R10), R12
+ VMOVDQU Y9, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxTwo_6x10_loop
+ VZEROUPPER
+
+mulAvxTwo_6x10_end:
+ RET
+
+// func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x10_64(SB), $0-88
+ // Loading 20 of 60 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 72 YMM used
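+	// Note: same GFNI scheme as mulGFNI_6x9_64, extended to 10 outputs; 20 of
+	// the 60 bit-matrices are held in Z0-Z19, the rest use the .BCST form.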
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulGFNI_6x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 Z20, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU64 Z21, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU64 Z22, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU64 Z23, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU64 Z24, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU64 Z25, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU64 Z26, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU64 Z27, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU64 Z28, (R12)(R11*1)
+ MOVQ 216(R10), R12
+ VMOVDQU64 Z29, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R11
+ DECQ AX
+ JNZ mulGFNI_6x10_64_loop
+ VZEROUPPER
+
+mulGFNI_6x10_64_end:
+ RET
+
+// func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x10(SB), $0-88
+ // Loading 4 of 60 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 72 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulAvxGFNI_6x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y4, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y5, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y8, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y9, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y10, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y11, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU Y12, (R12)(R11*1)
+ MOVQ 216(R10), R12
+ VMOVDQU Y13, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxGFNI_6x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x10_end:
+ RET
+
+// func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x10_64Xor(SB), $0-88
+ // Loading 20 of 60 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 72 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulGFNI_6x10_64Xor_loop:
+ // Load 10 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 (R12)(R11*1), Z20
+ MOVQ 24(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z21
+ MOVQ 48(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z22
+ MOVQ 72(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z23
+ MOVQ 96(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z24
+ MOVQ 120(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z25
+ MOVQ 144(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z26
+ MOVQ 168(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z27
+ MOVQ 192(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z28
+ MOVQ 216(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 Z20, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU64 Z21, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU64 Z22, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU64 Z23, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU64 Z24, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU64 Z25, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU64 Z26, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU64 Z27, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU64 Z28, (R12)(R11*1)
+ MOVQ 216(R10), R12
+ VMOVDQU64 Z29, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R11
+ DECQ AX
+ JNZ mulGFNI_6x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x10Xor(SB), $0-88
+ // Loading 4 of 60 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 72 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulAvxGFNI_6x10Xor_loop:
+ // Load 10 outputs
+ MOVQ (R10), R12
+ VMOVDQU (R12)(R11*1), Y4
+ MOVQ 24(R10), R12
+ VMOVDQU (R12)(R11*1), Y5
+ MOVQ 48(R10), R12
+ VMOVDQU (R12)(R11*1), Y6
+ MOVQ 72(R10), R12
+ VMOVDQU (R12)(R11*1), Y7
+ MOVQ 96(R10), R12
+ VMOVDQU (R12)(R11*1), Y8
+ MOVQ 120(R10), R12
+ VMOVDQU (R12)(R11*1), Y9
+ MOVQ 144(R10), R12
+ VMOVDQU (R12)(R11*1), Y10
+ MOVQ 168(R10), R12
+ VMOVDQU (R12)(R11*1), Y11
+ MOVQ 192(R10), R12
+ VMOVDQU (R12)(R11*1), Y12
+ MOVQ 216(R10), R12
+ VMOVDQU (R12)(R11*1), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y4, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y5, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y8, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y9, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y10, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y11, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU Y12, (R12)(R11*1)
+ MOVQ 216(R10), R12
+ VMOVDQU Y13, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxGFNI_6x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x10Xor_end:
+ RET
+
+// func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_6x10Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 135 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_6x10Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_6x10Xor_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ MOVQ (R10), R12
+ VMOVDQU (R12)(R11*1), Y0
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ MOVQ 24(R10), R12
+ VMOVDQU (R12)(R11*1), Y1
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ MOVQ 48(R10), R12
+ VMOVDQU (R12)(R11*1), Y2
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ MOVQ 72(R10), R12
+ VMOVDQU (R12)(R11*1), Y3
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ MOVQ 96(R10), R12
+ VMOVDQU (R12)(R11*1), Y4
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ MOVQ 120(R10), R12
+ VMOVDQU (R12)(R11*1), Y5
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ MOVQ 144(R10), R12
+ VMOVDQU (R12)(R11*1), Y6
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ MOVQ 168(R10), R12
+ VMOVDQU (R12)(R11*1), Y7
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ MOVQ 192(R10), R12
+ VMOVDQU (R12)(R11*1), Y8
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ MOVQ 216(R10), R12
+ VMOVDQU (R12)(R11*1), Y9
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y13
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y13
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1280(CX), Y11
+ VMOVDQU 1312(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1344(CX), Y11
+ VMOVDQU 1376(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 1408(CX), Y11
+ VMOVDQU 1440(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 1472(CX), Y11
+ VMOVDQU 1504(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 1536(CX), Y11
+ VMOVDQU 1568(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 1600(CX), Y11
+ VMOVDQU 1632(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1664(CX), Y11
+ VMOVDQU 1696(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1728(CX), Y11
+ VMOVDQU 1760(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1792(CX), Y11
+ VMOVDQU 1824(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1856(CX), Y11
+ VMOVDQU 1888(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y13
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1920(CX), Y11
+ VMOVDQU 1952(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1984(CX), Y11
+ VMOVDQU 2016(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2048(CX), Y11
+ VMOVDQU 2080(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2112(CX), Y11
+ VMOVDQU 2144(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2176(CX), Y11
+ VMOVDQU 2208(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2240(CX), Y11
+ VMOVDQU 2272(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2304(CX), Y11
+ VMOVDQU 2336(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 2368(CX), Y11
+ VMOVDQU 2400(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 2432(CX), Y11
+ VMOVDQU 2464(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 2496(CX), Y11
+ VMOVDQU 2528(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y13
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 2560(CX), Y11
+ VMOVDQU 2592(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 2624(CX), Y11
+ VMOVDQU 2656(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2688(CX), Y11
+ VMOVDQU 2720(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2752(CX), Y11
+ VMOVDQU 2784(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2816(CX), Y11
+ VMOVDQU 2848(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2880(CX), Y11
+ VMOVDQU 2912(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2944(CX), Y11
+ VMOVDQU 2976(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3008(CX), Y11
+ VMOVDQU 3040(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3072(CX), Y11
+ VMOVDQU 3104(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3136(CX), Y11
+ VMOVDQU 3168(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3200(CX), Y11
+ VMOVDQU 3232(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3264(CX), Y11
+ VMOVDQU 3296(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3328(CX), Y11
+ VMOVDQU 3360(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 3392(CX), Y11
+ VMOVDQU 3424(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 3456(CX), Y11
+ VMOVDQU 3488(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 3520(CX), Y11
+ VMOVDQU 3552(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 3584(CX), Y11
+ VMOVDQU 3616(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3648(CX), Y11
+ VMOVDQU 3680(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3712(CX), Y11
+ VMOVDQU 3744(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3776(CX), Y11
+ VMOVDQU 3808(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y0, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y1, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y2, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y3, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y4, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y5, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU Y8, (R12)(R11*1)
+ MOVQ 216(R10), R12
+ VMOVDQU Y9, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxTwo_6x10Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_6x10Xor_end:
+ RET
+
+// func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x1_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x1_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R11
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X2
+ VPBROADCASTB X2, Y2
+
+mulAvxTwo_7x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU 32(BX), Y5
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ VPXOR Y3, Y4, Y0
+ VPXOR Y5, Y6, Y1
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU 32(SI), Y5
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU 32(DI), Y5
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU (R8), Y6
+ VMOVDQU 32(R8), Y5
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU (R9), Y6
+ VMOVDQU 32(R9), Y5
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 256(CX), Y3
+ VMOVDQU 288(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU (R10), Y6
+ VMOVDQU 32(R10), Y5
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 320(CX), Y3
+ VMOVDQU 352(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU (DX), Y6
+ VMOVDQU 32(DX), Y5
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 384(CX), Y3
+ VMOVDQU 416(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Store 1 outputs
+ VMOVDQU Y0, (R11)
+ VMOVDQU Y1, 32(R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_7x1_64_loop
+ VZEROUPPER
+
+mulAvxTwo_7x1_64_end:
+ RET
+
+// func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), CX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, CX
+
+mulGFNI_7x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z8
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z8, Z7
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z8
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z8
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z8
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z8
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (R9), Z8
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z5, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU64 (CX), Z8
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z6, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Store 1 outputs
+ VMOVDQU64 Z7, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x1_64_loop
+ VZEROUPPER
+
+mulGFNI_7x1_64_end:
+ RET
+
+// func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), CX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, CX
+
+mulAvxGFNI_7x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y8, Y7
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y8
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (R9), Y8
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y5, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 6 to 1 outputs
+ VMOVDQU (CX), Y8
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y6, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Store 1 outputs
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x1_end:
+ RET
+
+// func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), CX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, CX
+
+mulGFNI_7x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (R10), Z7
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z8
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z8
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z8
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z8
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z8
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (R9), Z8
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z5, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU64 (CX), Z8
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z6, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Store 1 outputs
+ VMOVDQU64 Z7, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), CX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, CX
+
+mulAvxGFNI_7x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R10), Y7
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y8
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (R9), Y8
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y5, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 6 to 1 outputs
+ VMOVDQU (CX), Y8
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y6, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Store 1 outputs
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x1Xor_end:
+ RET
+
+// func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x1_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x1_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R11
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R12
+ MOVQ R12, X2
+ VPBROADCASTB X2, Y2
+
+mulAvxTwo_7x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R11), Y0
+ VMOVDQU 32(R11), Y1
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU 32(BX), Y5
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU 32(SI), Y5
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU 32(DI), Y5
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU (R8), Y6
+ VMOVDQU 32(R8), Y5
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU (R9), Y6
+ VMOVDQU 32(R9), Y5
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 256(CX), Y3
+ VMOVDQU 288(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU (R10), Y6
+ VMOVDQU 32(R10), Y5
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 320(CX), Y3
+ VMOVDQU 352(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU (DX), Y6
+ VMOVDQU 32(DX), Y5
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 384(CX), Y3
+ VMOVDQU 416(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Store 1 outputs
+ VMOVDQU Y0, (R11)
+ VMOVDQU Y1, 32(R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_7x1_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_7x1_64Xor_end:
+ RET
+
+// func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x2_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 65 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x2_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R11
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+ ADDQ R13, R11
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_7x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VPXOR Y7, Y8, Y3
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y9
+ VMOVDQU 32(SI), Y11
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU 32(DI), Y11
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y9
+ VMOVDQU 32(R8), Y11
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y9
+ VMOVDQU 32(R9), Y11
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y9
+ VMOVDQU 32(R10), Y11
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (R12)
+ VMOVDQU Y1, 32(R12)
+ ADDQ $0x40, R12
+ VMOVDQU Y2, (R11)
+ VMOVDQU Y3, 32(R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_7x2_64_loop
+ VZEROUPPER
+
+mulAvxTwo_7x2_64_end:
+ RET
+
+// func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), CX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R10
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R11
+ ADDQ R12, R10
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, CX
+
+mulGFNI_7x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z16
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z16, Z14
+ VGF2P8AFFINEQB $0x00, Z1, Z16, Z15
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z16
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z16
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z16
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z16
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (R9), Z16
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU64 (CX), Z16
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z12, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z13, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Store 2 outputs
+ VMOVDQU64 Z14, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z15, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x2_64_loop
+ VZEROUPPER
+
+mulGFNI_7x2_64_end:
+ RET
+
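The mulGFNI_* kernels reach the same result without lookup tables: each 64-bit word of the matrix argument packs the 8x8 bit matrix (over GF(2)) of "multiply by this coefficient", VBROADCASTF32X2 replicates that word across a ZMM register, and a single VGF2P8AFFINEQB then multiplies all 64 loaded input bytes by the coefficient; VXORPD accumulates the products into the output registers. A scalar sketch of the per-byte affine transform, following the byte and bit numbering used in Intel's description of the instruction:

package gfsketch

import "math/bits"

// gf2p8Affine sketches what VGF2P8AFFINEQB computes for one byte: result
// bit i is the GF(2) dot product (parity of the bitwise AND) of the source
// byte with byte 7-i of the 64-bit matrix operand. The imm8 constant term
// is 0 in every kernel above, so it is omitted here.
func gf2p8Affine(a uint64, x byte) byte {
	var y byte
	for i := uint(0); i < 8; i++ {
		row := byte(a >> (8 * (7 - i))) // matrix byte 7-i
		if bits.OnesCount8(row&x)&1 == 1 {
			y |= 1 << i
		}
	}
	return y
}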
+// func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x2(SB), $0-88
+ // Loading 12 of 14 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R11
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+ ADDQ R13, R11
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, DX
+
+mulAvxGFNI_7x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x2_end:
+ RET
+
+// func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), CX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R10
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R11
+ ADDQ R12, R10
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, CX
+
+mulGFNI_7x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (R11), Z14
+ VMOVDQU64 (R10), Z15
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z16
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z1, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z16
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z16
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z16
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z16
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (R9), Z16
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU64 (CX), Z16
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z12, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z13, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Store 2 outputs
+ VMOVDQU64 Z14, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z15, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x2_64Xor_end:
+ RET
+
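The Xor-suffixed kernels differ from their plain counterparts only in how each output block starts: the plain versions rebuild the outputs from scratch (the first input's products initialize the accumulators), while the Xor versions first reload the bytes already stored in the output slices ("Load 2 outputs" above) and fold the new products into them, so callers can accumulate partial results across several calls. A scalar sketch of the distinction; the GF(2^8) multiply below uses the reduction constant 0x1d only to keep the example self-contained, on the assumption that it matches the library's field polynomial:

package gfsketch

// gfMul multiplies two bytes in GF(2^8) by shift-and-reduce; 0x1d assumes the
// common Reed-Solomon field polynomial x^8+x^4+x^3+x^2+1.
func gfMul(a, b byte) byte {
	var p byte
	for b != 0 {
		if b&1 != 0 {
			p ^= a
		}
		hi := a & 0x80
		a <<= 1
		if hi != 0 {
			a ^= 0x1d
		}
		b >>= 1
	}
	return p
}

// dotRow is the GF(2^8) dot product both kernel flavours compute for one
// output byte k; c[i] is the matrix coefficient for input row i.
func dotRow(c []byte, in [][]byte, k int) byte {
	var acc byte
	for i := range in {
		acc ^= gfMul(c[i], in[i][k])
	}
	return acc
}

// rowBase overwrites the output, like mulGFNI_7x2_64 and the other plain
// kernels; rowXor accumulates into it, like the ...Xor kernels.
func rowBase(c []byte, in [][]byte, out []byte) {
	for k := range out {
		out[k] = dotRow(c, in, k)
	}
}

func rowXor(c []byte, in [][]byte, out []byte) {
	for k := range out {
		out[k] ^= dotRow(c, in, k)
	}
}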
+// func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x2Xor(SB), $0-88
+ // Loading 12 of 14 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R11
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+ ADDQ R13, R11
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, DX
+
+mulAvxGFNI_7x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R12), Y12
+ VMOVDQU (R11), Y13
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x2Xor_end:
+ RET
+
+// func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x2_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 65 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x2_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R11
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+ ADDQ R13, R11
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_7x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R12), Y0
+ VMOVDQU 32(R12), Y1
+ VMOVDQU (R11), Y2
+ VMOVDQU 32(R11), Y3
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y9
+ VMOVDQU 32(SI), Y11
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU 32(DI), Y11
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y9
+ VMOVDQU 32(R8), Y11
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y9
+ VMOVDQU 32(R9), Y11
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y9
+ VMOVDQU 32(R10), Y11
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (R12)
+ VMOVDQU Y1, 32(R12)
+ ADDQ $0x40, R12
+ VMOVDQU Y2, (R11)
+ VMOVDQU Y3, 32(R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_7x2_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_7x2_64Xor_end:
+ RET
+
+// func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x3_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 94 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x3_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R11
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R11
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_7x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VPXOR Y9, Y10, Y5
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y11
+ VMOVDQU 32(SI), Y13
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y11
+ VMOVDQU 32(DI), Y13
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y11
+ VMOVDQU 32(R8), Y13
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y11
+ VMOVDQU 32(R9), Y13
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y11
+ VMOVDQU 32(R10), Y13
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (R12)
+ VMOVDQU Y1, 32(R12)
+ ADDQ $0x40, R12
+ VMOVDQU Y2, (R13)
+ VMOVDQU Y3, 32(R13)
+ ADDQ $0x40, R13
+ VMOVDQU Y4, (R11)
+ VMOVDQU Y5, 32(R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_7x3_64_loop
+ VZEROUPPER
+
+mulAvxTwo_7x3_64_end:
+ RET
+
+// func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), CX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R10
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R10
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, CX
+
+mulGFNI_7x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z24
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z24, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z24, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z24, Z23
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z24
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z24
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z24
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z24
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (R9), Z24
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU64 (CX), Z24
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Store 3 outputs
+ VMOVDQU64 Z21, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z22, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x3_64_loop
+ VZEROUPPER
+
+mulGFNI_7x3_64_end:
+ RET
+
+// func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x3(SB), $0-88
+ // Loading 11 of 21 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R11
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R11
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, DX
+
+mulAvxGFNI_7x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x3_end:
+ RET
+
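The "Loading X of Y tables to registers" headers on the AVX/GFNI kernels record how many of the inputs-times-outputs coefficient matrices fit in YMM registers; whatever does not fit is re-broadcast from the matrix slice inside the loop with VBROADCASTSD, as in the input 3..6 blocks of mulAvxGFNI_7x3 above. The split appears to follow a simple budget, inferred from the three AvxGFNI kernels in this hunk rather than taken from the generator itself:

package gfsketch

// tablesInRegisters estimates how many coefficient matrices an AvxGFNI kernel
// keeps resident: 16 YMM registers minus one accumulator per output, one
// register for the current input block and one scratch register. It matches
// the "Loading 12 of 14", "11 of 21" and "10 of 28" headers of the 7x2, 7x3
// and 7x4 kernels in this file, but it is an inference, not the generator's
// actual rule.
func tablesInRegisters(outputs int) int {
	const ymmRegs = 16
	return ymmRegs - outputs - 2
}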
+// func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x3_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), CX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R10
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R10
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, CX
+
+mulGFNI_7x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (R11), Z21
+ VMOVDQU64 (R12), Z22
+ VMOVDQU64 (R10), Z23
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z24
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z24
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z24
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z24
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z24
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (R9), Z24
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU64 (CX), Z24
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Store 3 outputs
+ VMOVDQU64 Z21, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z22, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x3Xor(SB), $0-88
+ // Loading 11 of 21 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R11
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R11
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, DX
+
+mulAvxGFNI_7x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R12), Y11
+ VMOVDQU (R13), Y12
+ VMOVDQU (R11), Y13
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x3Xor_end:
+ RET
+
+// func mulAvxTwo_7x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x3_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 94 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x3_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R11
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R11
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_7x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R12), Y0
+ VMOVDQU 32(R12), Y1
+ VMOVDQU (R13), Y2
+ VMOVDQU 32(R13), Y3
+ VMOVDQU (R11), Y4
+ VMOVDQU 32(R11), Y5
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y11
+ VMOVDQU 32(SI), Y13
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y11
+ VMOVDQU 32(DI), Y13
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y11
+ VMOVDQU 32(R8), Y13
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y11
+ VMOVDQU 32(R9), Y13
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y11
+ VMOVDQU 32(R10), Y13
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (R12)
+ VMOVDQU Y1, 32(R12)
+ ADDQ $0x40, R12
+ VMOVDQU Y2, (R13)
+ VMOVDQU Y3, 32(R13)
+ ADDQ $0x40, R13
+ VMOVDQU Y4, (R11)
+ VMOVDQU Y5, 32(R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_7x3_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_7x3_64Xor_end:
+ RET
+
+// func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x4(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 65 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x4_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R11
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R11
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_7x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y1
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y3
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y7
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y7
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1024(CX), Y5
+ VMOVDQU 1056(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1088(CX), Y5
+ VMOVDQU 1120(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1152(CX), Y5
+ VMOVDQU 1184(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1216(CX), Y5
+ VMOVDQU 1248(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y7
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1280(CX), Y5
+ VMOVDQU 1312(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1344(CX), Y5
+ VMOVDQU 1376(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1408(CX), Y5
+ VMOVDQU 1440(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1472(CX), Y5
+ VMOVDQU 1504(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1536(CX), Y5
+ VMOVDQU 1568(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1600(CX), Y5
+ VMOVDQU 1632(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1664(CX), Y5
+ VMOVDQU 1696(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1728(CX), Y5
+ VMOVDQU 1760(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ VMOVDQU Y0, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y1, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y2, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y3, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_7x4_loop
+ VZEROUPPER
+
+mulAvxTwo_7x4_end:
+ RET
+
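All of these kernels share the same start/n contract, visible in each prologue: n is shifted right by the log2 of the per-iteration block size (SHRQ $0x06 for the 64-byte kernels, $0x05 for the 32-byte ones) to get the loop count, a zero count jumps straight to the end label, and start+72(FP) is added to every input and output pointer before the loop begins. A small sketch of the byte range a kernel therefore touches, assuming (as the prologues imply) that any tail shorter than one block is the caller's responsibility:

package gfsketch

// processedRange mirrors the shared prologue arithmetic: the loop executes
// n>>shift iterations over blocks of 1<<shift bytes, starting at offset start
// in every input and output slice. shift is 5 for the 32-byte kernels and 6
// for the 64-byte ones; any remainder of n below one block is left untouched.
func processedRange(start, n, shift int) (lo, hi int) {
	blocks := n >> shift
	return start, start + blocks<<shift
}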
+// func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x4_64(SB), $0-88
+ // Loading 26 of 28 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R11
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R11
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, DX
+
+mulGFNI_7x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 4 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 4 outputs
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x4_64_loop
+ VZEROUPPER
+
+mulGFNI_7x4_64_end:
+ RET
+
+// func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x4(SB), $0-88
+ // Loading 10 of 28 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
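+	// Note: same affine-GFNI scheme on 32-byte YMM vectors; only the first 10 of the
+	// 28 matrices stay resident in Y0-Y9, the rest are re-broadcast from CX inside
+	// the loop with VBROADCASTSD as they are needed.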
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R11
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R11
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, DX
+
+mulAvxGFNI_7x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x4_end:
+ RET
+
+// func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x4_64Xor(SB), $0-88
+ // Loading 26 of 28 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
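+	// Note: Xor variant - each iteration first loads the current contents of the
+	// four destinations and XORs the new products on top, so results accumulate
+	// into out instead of replacing it.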
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R11
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R11
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, DX
+
+mulGFNI_7x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (R12), Z26
+ VMOVDQU64 (R13), Z27
+ VMOVDQU64 (R14), Z28
+ VMOVDQU64 (R11), Z29
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 4 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 4 outputs
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x4Xor(SB), $0-88
+ // Loading 10 of 28 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R11
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R11
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, DX
+
+mulAvxGFNI_7x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (R12), Y10
+ VMOVDQU (R13), Y11
+ VMOVDQU (R14), Y12
+ VMOVDQU (R11), Y13
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x4Xor_end:
+ RET
+
+// func mulAvxTwo_7x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x4Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 65 YMM used
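+	// Note: AVX2 lookup path - every input byte is split into low/high nibbles with
+	// the 0x0f mask in Y4, each nibble indexes a VPSHUFB table streamed from CX, and
+	// XOR3WAY folds both halves into the previously loaded output vectors Y0-Y3.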
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x4Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R11
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R11
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_7x4Xor_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (R12), Y0
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU (R13), Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU (R14), Y2
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU (R11), Y3
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y7
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y7
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1024(CX), Y5
+ VMOVDQU 1056(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1088(CX), Y5
+ VMOVDQU 1120(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1152(CX), Y5
+ VMOVDQU 1184(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1216(CX), Y5
+ VMOVDQU 1248(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y7
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1280(CX), Y5
+ VMOVDQU 1312(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1344(CX), Y5
+ VMOVDQU 1376(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1408(CX), Y5
+ VMOVDQU 1440(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1472(CX), Y5
+ VMOVDQU 1504(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1536(CX), Y5
+ VMOVDQU 1568(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1600(CX), Y5
+ VMOVDQU 1632(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1664(CX), Y5
+ VMOVDQU 1696(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1728(CX), Y5
+ VMOVDQU 1760(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ VMOVDQU Y0, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y1, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y2, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y3, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_7x4Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_7x4Xor_end:
+ RET
+
+// func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x5(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 80 YMM used
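+	// Note: with 5 outputs no YMM registers are spare for tables, so every 32-byte
+	// lookup table is read from CX on each use; input 0 initializes Y0-Y4 with VPXOR
+	// and the remaining six inputs accumulate into them via XOR3WAY.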
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x5_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_7x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y0
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y1
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y2
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y3
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y4
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 640(CX), Y6
+ VMOVDQU 672(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 704(CX), Y6
+ VMOVDQU 736(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 768(CX), Y6
+ VMOVDQU 800(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 832(CX), Y6
+ VMOVDQU 864(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 896(CX), Y6
+ VMOVDQU 928(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y8
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 960(CX), Y6
+ VMOVDQU 992(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1024(CX), Y6
+ VMOVDQU 1056(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1088(CX), Y6
+ VMOVDQU 1120(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1152(CX), Y6
+ VMOVDQU 1184(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1216(CX), Y6
+ VMOVDQU 1248(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y8
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1280(CX), Y6
+ VMOVDQU 1312(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1344(CX), Y6
+ VMOVDQU 1376(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1408(CX), Y6
+ VMOVDQU 1440(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1472(CX), Y6
+ VMOVDQU 1504(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1536(CX), Y6
+ VMOVDQU 1568(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y8
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1600(CX), Y6
+ VMOVDQU 1632(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1664(CX), Y6
+ VMOVDQU 1696(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1728(CX), Y6
+ VMOVDQU 1760(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1792(CX), Y6
+ VMOVDQU 1824(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1856(CX), Y6
+ VMOVDQU 1888(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1920(CX), Y6
+ VMOVDQU 1952(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1984(CX), Y6
+ VMOVDQU 2016(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2048(CX), Y6
+ VMOVDQU 2080(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2112(CX), Y6
+ VMOVDQU 2144(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2176(CX), Y6
+ VMOVDQU 2208(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ VMOVDQU Y0, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y1, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y2, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y3, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y4, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_7x5_loop
+ VZEROUPPER
+
+mulAvxTwo_7x5_end:
+ RET
+
+// func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x5_64(SB), $8-88
+ // Loading 25 of 35 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
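+	// Note: 25 of the 35 coefficient matrices fit in Z0-Z24; inputs 5 and 6 fall back
+	// to VGF2P8AFFINEQB.BCST, broadcasting the remaining matrices straight from memory.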
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, DX
+
+mulGFNI_7x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x5_64_loop
+ VZEROUPPER
+
+mulGFNI_7x5_64_end:
+ RET
+
+// func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x5(SB), $8-88
+ // Loading 9 of 35 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, DX
+
+mulAvxGFNI_7x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x5_end:
+ RET
+
+// func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x5_64Xor(SB), $8-88
+ // Loading 25 of 35 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, DX
+
+mulGFNI_7x5_64Xor_loop:
+ // Load 5 outputs
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R11), Z29
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x5Xor(SB), $8-88
+ // Loading 9 of 35 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, DX
+
+mulAvxGFNI_7x5Xor_loop:
+ // Load 5 outputs
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R11), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x5Xor_end:
+ RET
+
+// func mulAvxTwo_7x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x5Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 80 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x5Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_7x5Xor_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU (R12), Y0
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU (R13), Y1
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU (R14), Y2
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU (R15), Y3
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU (R11), Y4
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 640(CX), Y6
+ VMOVDQU 672(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 704(CX), Y6
+ VMOVDQU 736(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 768(CX), Y6
+ VMOVDQU 800(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 832(CX), Y6
+ VMOVDQU 864(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 896(CX), Y6
+ VMOVDQU 928(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y8
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 960(CX), Y6
+ VMOVDQU 992(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1024(CX), Y6
+ VMOVDQU 1056(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1088(CX), Y6
+ VMOVDQU 1120(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1152(CX), Y6
+ VMOVDQU 1184(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1216(CX), Y6
+ VMOVDQU 1248(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y8
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1280(CX), Y6
+ VMOVDQU 1312(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1344(CX), Y6
+ VMOVDQU 1376(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1408(CX), Y6
+ VMOVDQU 1440(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1472(CX), Y6
+ VMOVDQU 1504(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1536(CX), Y6
+ VMOVDQU 1568(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y8
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1600(CX), Y6
+ VMOVDQU 1632(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1664(CX), Y6
+ VMOVDQU 1696(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1728(CX), Y6
+ VMOVDQU 1760(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1792(CX), Y6
+ VMOVDQU 1824(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1856(CX), Y6
+ VMOVDQU 1888(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1920(CX), Y6
+ VMOVDQU 1952(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1984(CX), Y6
+ VMOVDQU 2016(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2048(CX), Y6
+ VMOVDQU 2080(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2112(CX), Y6
+ VMOVDQU 2144(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2176(CX), Y6
+ VMOVDQU 2208(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ VMOVDQU Y0, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y1, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y2, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y3, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y4, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_7x5Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_7x5Xor_end:
+ RET
+
+// func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x6(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 95 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x6_end
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), AX
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, AX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X6
+ VPBROADCASTB X6, Y6
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
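+	// AX was reclaimed above for the last input pointer, so the 32-byte block count
+	// is recomputed into BP here, once the start offset and mask constant no longer
+	// need it.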
+
+mulAvxTwo_7x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y3
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y5
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R8), Y9
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1536(CX), Y7
+ VMOVDQU 1568(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1600(CX), Y7
+ VMOVDQU 1632(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1664(CX), Y7
+ VMOVDQU 1696(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1728(CX), Y7
+ VMOVDQU 1760(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1792(CX), Y7
+ VMOVDQU 1824(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1856(CX), Y7
+ VMOVDQU 1888(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R9), Y9
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1920(CX), Y7
+ VMOVDQU 1952(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1984(CX), Y7
+ VMOVDQU 2016(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2048(CX), Y7
+ VMOVDQU 2080(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2112(CX), Y7
+ VMOVDQU 2144(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2176(CX), Y7
+ VMOVDQU 2208(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2240(CX), Y7
+ VMOVDQU 2272(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (AX), Y9
+ ADDQ $0x20, AX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 2304(CX), Y7
+ VMOVDQU 2336(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 2368(CX), Y7
+ VMOVDQU 2400(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2432(CX), Y7
+ VMOVDQU 2464(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2496(CX), Y7
+ VMOVDQU 2528(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2560(CX), Y7
+ VMOVDQU 2592(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2624(CX), Y7
+ VMOVDQU 2656(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ VMOVDQU Y0, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y1, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y2, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y3, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y4, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y5, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxTwo_7x6_loop
+ VZEROUPPER
+
+mulAvxTwo_7x6_end:
+ RET
+
+// func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x6_64(SB), $8-88
+ // Loading 24 of 42 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
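+	// Note: each 64-bit matrix entry is an 8x8 GF(2) bit matrix broadcast
+	// to a full Z register with VBROADCASTF32X2; VGF2P8AFFINEQB multiplies
+	// 64 input bytes by it at once, and VXORPD folds the partial products
+	// into the six accumulators. Entries past the 24 kept in registers are
+	// broadcast straight from memory via the .BCST form.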
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), AX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_7x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 6 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_7x6_64_loop
+ VZEROUPPER
+
+mulGFNI_7x6_64_end:
+ RET
+
+// func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x6(SB), $8-88
+ // Loading 8 of 42 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
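+	// Note: YMM variant of the GFNI kernel above, processing 32 bytes per
+	// iteration; the first 8 matrix entries stay in Y0-Y7 and the rest are
+	// re-broadcast with VBROADCASTSD before each VGF2P8AFFINEQB.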
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), AX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_7x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_7x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x6_end:
+ RET
+
+// func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x6_64Xor(SB), $8-88
+ // Loading 24 of 42 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
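+	// Note: Xor variant - the six output blocks are loaded at the top of
+	// every iteration and the affine products are XORed into them, so the
+	// result accumulates into the existing output shards instead of
+	// overwriting them.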
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), AX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_7x6_64Xor_loop:
+ // Load 6 outputs
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R10), Z29
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 6 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_7x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x6Xor(SB), $8-88
+ // Loading 8 of 42 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
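+	// Note: Xor variant of mulAvxGFNI_7x6 - the existing output blocks are
+	// loaded first, then the GFNI products are accumulated into them with
+	// VXORPD.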
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), AX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_7x6Xor_loop:
+ // Load 6 outputs
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R10), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_7x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x6Xor_end:
+ RET
+
+// func mulAvxTwo_7x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x6Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 95 YMM used
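+	// Note: Xor variant of mulAvxTwo_7x6 - the current contents of the six
+	// outputs are loaded into Y0-Y5 while input 0 is processed, so the
+	// nibble-table products are XORed on top of the existing data.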
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x6Xor_end
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), AX
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, AX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X6
+ VPBROADCASTB X6, Y6
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxTwo_7x6Xor_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU (R11), Y0
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU (R12), Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU (R13), Y2
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU (R14), Y3
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU (R15), Y4
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU (R10), Y5
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R8), Y9
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1536(CX), Y7
+ VMOVDQU 1568(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1600(CX), Y7
+ VMOVDQU 1632(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1664(CX), Y7
+ VMOVDQU 1696(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1728(CX), Y7
+ VMOVDQU 1760(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1792(CX), Y7
+ VMOVDQU 1824(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1856(CX), Y7
+ VMOVDQU 1888(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R9), Y9
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1920(CX), Y7
+ VMOVDQU 1952(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1984(CX), Y7
+ VMOVDQU 2016(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2048(CX), Y7
+ VMOVDQU 2080(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2112(CX), Y7
+ VMOVDQU 2144(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2176(CX), Y7
+ VMOVDQU 2208(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2240(CX), Y7
+ VMOVDQU 2272(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (AX), Y9
+ ADDQ $0x20, AX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 2304(CX), Y7
+ VMOVDQU 2336(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 2368(CX), Y7
+ VMOVDQU 2400(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2432(CX), Y7
+ VMOVDQU 2464(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2496(CX), Y7
+ VMOVDQU 2528(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2560(CX), Y7
+ VMOVDQU 2592(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2624(CX), Y7
+ VMOVDQU 2656(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ VMOVDQU Y0, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y1, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y2, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y3, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y4, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y5, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxTwo_7x6Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_7x6Xor_end:
+ RET
+
+// func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x7(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 110 YMM used
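+	// Note: with 7 outputs there are not enough free GP registers for every
+	// output pointer, so only the out slice base is kept in R11; each store
+	// re-reads the pointer from the slice and indexes it with the running
+	// offset in R12 (start plus bytes processed so far).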
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x7_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_7x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y0
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y1
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y2
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y3
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y4
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y5
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y6
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 896(CX), Y8
+ VMOVDQU 928(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 960(CX), Y8
+ VMOVDQU 992(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1024(CX), Y8
+ VMOVDQU 1056(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1088(CX), Y8
+ VMOVDQU 1120(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1152(CX), Y8
+ VMOVDQU 1184(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1216(CX), Y8
+ VMOVDQU 1248(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1280(CX), Y8
+ VMOVDQU 1312(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y10
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1344(CX), Y8
+ VMOVDQU 1376(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1408(CX), Y8
+ VMOVDQU 1440(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1472(CX), Y8
+ VMOVDQU 1504(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1536(CX), Y8
+ VMOVDQU 1568(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1600(CX), Y8
+ VMOVDQU 1632(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1664(CX), Y8
+ VMOVDQU 1696(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1728(CX), Y8
+ VMOVDQU 1760(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y10
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1792(CX), Y8
+ VMOVDQU 1824(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1856(CX), Y8
+ VMOVDQU 1888(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1920(CX), Y8
+ VMOVDQU 1952(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1984(CX), Y8
+ VMOVDQU 2016(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2048(CX), Y8
+ VMOVDQU 2080(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2112(CX), Y8
+ VMOVDQU 2144(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2176(CX), Y8
+ VMOVDQU 2208(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y10
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2240(CX), Y8
+ VMOVDQU 2272(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2304(CX), Y8
+ VMOVDQU 2336(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2368(CX), Y8
+ VMOVDQU 2400(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2432(CX), Y8
+ VMOVDQU 2464(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2496(CX), Y8
+ VMOVDQU 2528(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2560(CX), Y8
+ VMOVDQU 2592(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2624(CX), Y8
+ VMOVDQU 2656(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2688(CX), Y8
+ VMOVDQU 2720(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2752(CX), Y8
+ VMOVDQU 2784(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2816(CX), Y8
+ VMOVDQU 2848(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2880(CX), Y8
+ VMOVDQU 2912(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2944(CX), Y8
+ VMOVDQU 2976(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3008(CX), Y8
+ VMOVDQU 3040(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3072(CX), Y8
+ VMOVDQU 3104(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y0, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y1, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y2, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y3, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y4, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y5, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxTwo_7x7_loop
+ VZEROUPPER
+
+mulAvxTwo_7x7_end:
+ RET
+
+// func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x7_64(SB), $0-88
+ // Loading 23 of 49 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 58 YMM used
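+	// Note: GFNI/ZMM kernel with 7 outputs; only 23 of the 49 matrix
+	// entries are kept in Z registers, the rest are broadcast from memory,
+	// and output pointers are re-read from the out slice for every store.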
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulGFNI_7x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 Z23, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU64 Z24, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU64 Z25, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU64 Z26, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU64 Z27, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU64 Z28, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU64 Z29, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R12
+ DECQ AX
+ JNZ mulGFNI_7x7_64_loop
+ VZEROUPPER
+
+mulGFNI_7x7_64_end:
+ RET
+
+// func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x7(SB), $0-88
+ // Loading 7 of 49 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 58 YMM used
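+	// Note: YMM GFNI kernel with 7 outputs; Y0-Y6 hold the first 7 matrix
+	// entries, the remaining 42 are re-broadcast per use, and the outputs
+	// are addressed indirectly through the out slice with the running
+	// offset in R12.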
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulAvxGFNI_7x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y10, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y11, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y12, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y13, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxGFNI_7x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x7_end:
+ RET
+
+// func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x7_64Xor(SB), $0-88
+ // Loading 23 of 49 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 58 YMM used
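+	// Note: Xor variant of mulGFNI_7x7_64 - all seven outputs are loaded
+	// through the out slice at the start of each iteration and the affine
+	// products are XORed into them.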
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulGFNI_7x7_64Xor_loop:
+ // Load 7 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 (R13)(R12*1), Z23
+ MOVQ 24(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z24
+ MOVQ 48(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z25
+ MOVQ 72(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z26
+ MOVQ 96(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z27
+ MOVQ 120(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z28
+ MOVQ 144(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z29
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 Z23, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU64 Z24, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU64 Z25, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU64 Z26, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU64 Z27, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU64 Z28, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU64 Z29, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R12
+ DECQ AX
+ JNZ mulGFNI_7x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x7_64Xor_end:
+ RET
+
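+// "Loading N of M tables to registers" in the headers refers to the 8-byte
+// GF(2) bit matrices read from the matrix slice. Matrices that do not fit in
+// registers are re-read inside the loop: the AVX routines broadcast them with
+// VBROADCASTSD, while the AVX-512 routines use the embedded-broadcast form
+// VGF2P8AFFINEQB.BCST directly from memory.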
+// func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x7Xor(SB), $0-88
+ // Loading 7 of 49 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 58 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulAvxGFNI_7x7Xor_loop:
+ // Load 7 outputs
+ MOVQ (R11), R13
+ VMOVDQU (R13)(R12*1), Y7
+ MOVQ 24(R11), R13
+ VMOVDQU (R13)(R12*1), Y8
+ MOVQ 48(R11), R13
+ VMOVDQU (R13)(R12*1), Y9
+ MOVQ 72(R11), R13
+ VMOVDQU (R13)(R12*1), Y10
+ MOVQ 96(R11), R13
+ VMOVDQU (R13)(R12*1), Y11
+ MOVQ 120(R11), R13
+ VMOVDQU (R13)(R12*1), Y12
+ MOVQ 144(R11), R13
+ VMOVDQU (R13)(R12*1), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y10, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y11, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y12, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y13, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxGFNI_7x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x7Xor_end:
+ RET
+
+// func mulAvxTwo_7x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x7Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 110 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x7Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_7x7Xor_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ MOVQ (R11), R13
+ VMOVDQU (R13)(R12*1), Y0
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ MOVQ 24(R11), R13
+ VMOVDQU (R13)(R12*1), Y1
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ MOVQ 48(R11), R13
+ VMOVDQU (R13)(R12*1), Y2
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ MOVQ 72(R11), R13
+ VMOVDQU (R13)(R12*1), Y3
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ MOVQ 96(R11), R13
+ VMOVDQU (R13)(R12*1), Y4
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ MOVQ 120(R11), R13
+ VMOVDQU (R13)(R12*1), Y5
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ MOVQ 144(R11), R13
+ VMOVDQU (R13)(R12*1), Y6
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 896(CX), Y8
+ VMOVDQU 928(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 960(CX), Y8
+ VMOVDQU 992(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1024(CX), Y8
+ VMOVDQU 1056(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1088(CX), Y8
+ VMOVDQU 1120(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1152(CX), Y8
+ VMOVDQU 1184(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1216(CX), Y8
+ VMOVDQU 1248(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1280(CX), Y8
+ VMOVDQU 1312(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y10
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1344(CX), Y8
+ VMOVDQU 1376(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1408(CX), Y8
+ VMOVDQU 1440(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1472(CX), Y8
+ VMOVDQU 1504(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1536(CX), Y8
+ VMOVDQU 1568(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1600(CX), Y8
+ VMOVDQU 1632(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1664(CX), Y8
+ VMOVDQU 1696(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1728(CX), Y8
+ VMOVDQU 1760(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y10
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1792(CX), Y8
+ VMOVDQU 1824(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1856(CX), Y8
+ VMOVDQU 1888(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1920(CX), Y8
+ VMOVDQU 1952(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1984(CX), Y8
+ VMOVDQU 2016(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2048(CX), Y8
+ VMOVDQU 2080(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2112(CX), Y8
+ VMOVDQU 2144(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2176(CX), Y8
+ VMOVDQU 2208(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y10
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2240(CX), Y8
+ VMOVDQU 2272(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2304(CX), Y8
+ VMOVDQU 2336(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2368(CX), Y8
+ VMOVDQU 2400(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2432(CX), Y8
+ VMOVDQU 2464(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2496(CX), Y8
+ VMOVDQU 2528(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2560(CX), Y8
+ VMOVDQU 2592(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2624(CX), Y8
+ VMOVDQU 2656(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2688(CX), Y8
+ VMOVDQU 2720(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2752(CX), Y8
+ VMOVDQU 2784(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2816(CX), Y8
+ VMOVDQU 2848(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2880(CX), Y8
+ VMOVDQU 2912(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2944(CX), Y8
+ VMOVDQU 2976(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3008(CX), Y8
+ VMOVDQU 3040(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3072(CX), Y8
+ VMOVDQU 3104(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y0, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y1, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y2, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y3, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y4, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y5, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxTwo_7x7Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_7x7Xor_end:
+ RET
+
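+// The mulAvxTwo_* routines implement GF(2^8) multiplication with the
+// split-nibble technique: each input byte is split into its low and high
+// 4-bit halves, each half indexes a 32-byte VPSHUFB lookup table from the
+// matrix slice, and the two partial results are XORed into the output
+// accumulator (the XOR3WAY macro). The GFNI routines instead multiply whole
+// bytes by an 8x8 bit matrix with VGF2P8AFFINEQB.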
+// func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x8(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 125 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x8_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_7x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y0
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y2
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y4
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y5
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y6
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y7
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1024(CX), Y9
+ VMOVDQU 1056(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1088(CX), Y9
+ VMOVDQU 1120(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1152(CX), Y9
+ VMOVDQU 1184(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1216(CX), Y9
+ VMOVDQU 1248(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y9
+ VMOVDQU 1312(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1344(CX), Y9
+ VMOVDQU 1376(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1408(CX), Y9
+ VMOVDQU 1440(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1472(CX), Y9
+ VMOVDQU 1504(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y11
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1536(CX), Y9
+ VMOVDQU 1568(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1600(CX), Y9
+ VMOVDQU 1632(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1664(CX), Y9
+ VMOVDQU 1696(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1728(CX), Y9
+ VMOVDQU 1760(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1792(CX), Y9
+ VMOVDQU 1824(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1856(CX), Y9
+ VMOVDQU 1888(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1920(CX), Y9
+ VMOVDQU 1952(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1984(CX), Y9
+ VMOVDQU 2016(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y11
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2048(CX), Y9
+ VMOVDQU 2080(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2112(CX), Y9
+ VMOVDQU 2144(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2176(CX), Y9
+ VMOVDQU 2208(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2240(CX), Y9
+ VMOVDQU 2272(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2304(CX), Y9
+ VMOVDQU 2336(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2368(CX), Y9
+ VMOVDQU 2400(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2432(CX), Y9
+ VMOVDQU 2464(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 2496(CX), Y9
+ VMOVDQU 2528(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y11
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2560(CX), Y9
+ VMOVDQU 2592(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2624(CX), Y9
+ VMOVDQU 2656(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2688(CX), Y9
+ VMOVDQU 2720(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2752(CX), Y9
+ VMOVDQU 2784(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2816(CX), Y9
+ VMOVDQU 2848(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2880(CX), Y9
+ VMOVDQU 2912(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2944(CX), Y9
+ VMOVDQU 2976(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3008(CX), Y9
+ VMOVDQU 3040(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 3072(CX), Y9
+ VMOVDQU 3104(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 3136(CX), Y9
+ VMOVDQU 3168(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 3200(CX), Y9
+ VMOVDQU 3232(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 3264(CX), Y9
+ VMOVDQU 3296(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 3328(CX), Y9
+ VMOVDQU 3360(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 3392(CX), Y9
+ VMOVDQU 3424(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 3456(CX), Y9
+ VMOVDQU 3488(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3520(CX), Y9
+ VMOVDQU 3552(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y0, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y1, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y2, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y3, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y4, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y5, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxTwo_7x8_loop
+ VZEROUPPER
+
+mulAvxTwo_7x8_end:
+ RET
+
+// func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x8_64(SB), $0-88
+ // Loading 22 of 56 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 66 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulGFNI_7x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 Z22, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU64 Z23, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU64 Z24, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU64 Z25, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU64 Z26, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU64 Z27, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU64 Z28, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU64 Z29, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R12
+ DECQ AX
+ JNZ mulGFNI_7x8_64_loop
+ VZEROUPPER
+
+mulGFNI_7x8_64_end:
+ RET
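+// The mulGFNI_*_64 routines consume 64 bytes per shard per iteration
+// (ZMM registers; the loop count is n>>6), while the mulAvxGFNI_* routines
+// consume 32 bytes per iteration (YMM registers; n>>5).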
+
+// func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x8(SB), $0-88
+ // Loading 6 of 56 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 66 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulAvxGFNI_7x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y10, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y11, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y12, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y13, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxGFNI_7x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x8_end:
+ RET
+
+// func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x8_64Xor(SB), $0-88
+ // Loading 22 of 56 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 66 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulGFNI_7x8_64Xor_loop:
+ // Load 8 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 (R13)(R12*1), Z22
+ MOVQ 24(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z23
+ MOVQ 48(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z24
+ MOVQ 72(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z25
+ MOVQ 96(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z26
+ MOVQ 120(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z27
+ MOVQ 144(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z28
+ MOVQ 168(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z29
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 Z22, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU64 Z23, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU64 Z24, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU64 Z25, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU64 Z26, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU64 Z27, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU64 Z28, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU64 Z29, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R12
+ DECQ AX
+ JNZ mulGFNI_7x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x8Xor(SB), $0-88
+ // Loading 6 of 56 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 66 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulAvxGFNI_7x8Xor_loop:
+ // Load 8 outputs
+ MOVQ (R11), R13
+ VMOVDQU (R13)(R12*1), Y6
+ MOVQ 24(R11), R13
+ VMOVDQU (R13)(R12*1), Y7
+ MOVQ 48(R11), R13
+ VMOVDQU (R13)(R12*1), Y8
+ MOVQ 72(R11), R13
+ VMOVDQU (R13)(R12*1), Y9
+ MOVQ 96(R11), R13
+ VMOVDQU (R13)(R12*1), Y10
+ MOVQ 120(R11), R13
+ VMOVDQU (R13)(R12*1), Y11
+ MOVQ 144(R11), R13
+ VMOVDQU (R13)(R12*1), Y12
+ MOVQ 168(R11), R13
+ VMOVDQU (R13)(R12*1), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y10, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y11, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y12, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y13, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxGFNI_7x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x8Xor_end:
+ RET
+
+// func mulAvxTwo_7x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x8Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 125 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x8Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_7x8Xor_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ MOVQ (R11), R13
+ VMOVDQU (R13)(R12*1), Y0
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ MOVQ 24(R11), R13
+ VMOVDQU (R13)(R12*1), Y1
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ MOVQ 48(R11), R13
+ VMOVDQU (R13)(R12*1), Y2
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ MOVQ 72(R11), R13
+ VMOVDQU (R13)(R12*1), Y3
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ MOVQ 96(R11), R13
+ VMOVDQU (R13)(R12*1), Y4
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ MOVQ 120(R11), R13
+ VMOVDQU (R13)(R12*1), Y5
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ MOVQ 144(R11), R13
+ VMOVDQU (R13)(R12*1), Y6
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ MOVQ 168(R11), R13
+ VMOVDQU (R13)(R12*1), Y7
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1024(CX), Y9
+ VMOVDQU 1056(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1088(CX), Y9
+ VMOVDQU 1120(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1152(CX), Y9
+ VMOVDQU 1184(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1216(CX), Y9
+ VMOVDQU 1248(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y9
+ VMOVDQU 1312(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1344(CX), Y9
+ VMOVDQU 1376(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1408(CX), Y9
+ VMOVDQU 1440(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1472(CX), Y9
+ VMOVDQU 1504(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y11
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1536(CX), Y9
+ VMOVDQU 1568(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1600(CX), Y9
+ VMOVDQU 1632(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1664(CX), Y9
+ VMOVDQU 1696(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1728(CX), Y9
+ VMOVDQU 1760(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1792(CX), Y9
+ VMOVDQU 1824(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1856(CX), Y9
+ VMOVDQU 1888(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1920(CX), Y9
+ VMOVDQU 1952(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1984(CX), Y9
+ VMOVDQU 2016(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y11
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2048(CX), Y9
+ VMOVDQU 2080(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2112(CX), Y9
+ VMOVDQU 2144(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2176(CX), Y9
+ VMOVDQU 2208(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2240(CX), Y9
+ VMOVDQU 2272(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2304(CX), Y9
+ VMOVDQU 2336(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2368(CX), Y9
+ VMOVDQU 2400(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2432(CX), Y9
+ VMOVDQU 2464(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 2496(CX), Y9
+ VMOVDQU 2528(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y11
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2560(CX), Y9
+ VMOVDQU 2592(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2624(CX), Y9
+ VMOVDQU 2656(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2688(CX), Y9
+ VMOVDQU 2720(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2752(CX), Y9
+ VMOVDQU 2784(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2816(CX), Y9
+ VMOVDQU 2848(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2880(CX), Y9
+ VMOVDQU 2912(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2944(CX), Y9
+ VMOVDQU 2976(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3008(CX), Y9
+ VMOVDQU 3040(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 3072(CX), Y9
+ VMOVDQU 3104(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 3136(CX), Y9
+ VMOVDQU 3168(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 3200(CX), Y9
+ VMOVDQU 3232(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 3264(CX), Y9
+ VMOVDQU 3296(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 3328(CX), Y9
+ VMOVDQU 3360(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 3392(CX), Y9
+ VMOVDQU 3424(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 3456(CX), Y9
+ VMOVDQU 3488(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3520(CX), Y9
+ VMOVDQU 3552(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y0, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y1, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y2, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y3, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y4, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y5, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxTwo_7x8Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_7x8Xor_end:
+ RET
+
+// func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x9(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 140 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x9_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_7x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y0
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y1
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y2
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y3
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y4
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y5
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y6
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y7
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y8
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y12
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1152(CX), Y10
+ VMOVDQU 1184(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1216(CX), Y10
+ VMOVDQU 1248(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1280(CX), Y10
+ VMOVDQU 1312(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1344(CX), Y10
+ VMOVDQU 1376(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1408(CX), Y10
+ VMOVDQU 1440(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 1472(CX), Y10
+ VMOVDQU 1504(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 1536(CX), Y10
+ VMOVDQU 1568(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1600(CX), Y10
+ VMOVDQU 1632(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1664(CX), Y10
+ VMOVDQU 1696(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y12
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1728(CX), Y10
+ VMOVDQU 1760(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1792(CX), Y10
+ VMOVDQU 1824(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1856(CX), Y10
+ VMOVDQU 1888(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1920(CX), Y10
+ VMOVDQU 1952(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1984(CX), Y10
+ VMOVDQU 2016(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2048(CX), Y10
+ VMOVDQU 2080(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2112(CX), Y10
+ VMOVDQU 2144(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2176(CX), Y10
+ VMOVDQU 2208(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2240(CX), Y10
+ VMOVDQU 2272(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y12
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2304(CX), Y10
+ VMOVDQU 2336(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2368(CX), Y10
+ VMOVDQU 2400(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 2432(CX), Y10
+ VMOVDQU 2464(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 2496(CX), Y10
+ VMOVDQU 2528(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 2560(CX), Y10
+ VMOVDQU 2592(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2624(CX), Y10
+ VMOVDQU 2656(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2688(CX), Y10
+ VMOVDQU 2720(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2752(CX), Y10
+ VMOVDQU 2784(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2816(CX), Y10
+ VMOVDQU 2848(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y12
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2880(CX), Y10
+ VMOVDQU 2912(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2944(CX), Y10
+ VMOVDQU 2976(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3008(CX), Y10
+ VMOVDQU 3040(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3072(CX), Y10
+ VMOVDQU 3104(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3136(CX), Y10
+ VMOVDQU 3168(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3200(CX), Y10
+ VMOVDQU 3232(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3264(CX), Y10
+ VMOVDQU 3296(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3328(CX), Y10
+ VMOVDQU 3360(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3392(CX), Y10
+ VMOVDQU 3424(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 3456(CX), Y10
+ VMOVDQU 3488(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 3520(CX), Y10
+ VMOVDQU 3552(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3584(CX), Y10
+ VMOVDQU 3616(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3648(CX), Y10
+ VMOVDQU 3680(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3712(CX), Y10
+ VMOVDQU 3744(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3776(CX), Y10
+ VMOVDQU 3808(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3840(CX), Y10
+ VMOVDQU 3872(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3904(CX), Y10
+ VMOVDQU 3936(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3968(CX), Y10
+ VMOVDQU 4000(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y0, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y1, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y2, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y3, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y4, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y5, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxTwo_7x9_loop
+ VZEROUPPER
+
+mulAvxTwo_7x9_end:
+ RET
+
+// func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x9_64(SB), $0-88
+ // Loading 21 of 63 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 74 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulGFNI_7x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 Z21, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU64 Z22, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU64 Z23, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU64 Z24, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU64 Z25, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU64 Z26, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU64 Z27, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU64 Z28, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU64 Z29, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R12
+ DECQ AX
+ JNZ mulGFNI_7x9_64_loop
+ VZEROUPPER
+
+mulGFNI_7x9_64_end:
+ RET
+
+// func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x9(SB), $0-88
+ // Loading 5 of 63 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 74 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulAvxGFNI_7x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y5, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y10, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y11, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y12, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU Y13, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxGFNI_7x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x9_end:
+ RET
+
+// func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x9_64Xor(SB), $0-88
+ // Loading 21 of 63 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 74 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulGFNI_7x9_64Xor_loop:
+ // Load 9 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 (R13)(R12*1), Z21
+ MOVQ 24(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z22
+ MOVQ 48(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z23
+ MOVQ 72(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z24
+ MOVQ 96(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z25
+ MOVQ 120(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z26
+ MOVQ 144(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z27
+ MOVQ 168(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z28
+ MOVQ 192(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z29
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 Z21, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU64 Z22, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU64 Z23, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU64 Z24, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU64 Z25, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU64 Z26, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU64 Z27, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU64 Z28, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU64 Z29, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R12
+ DECQ AX
+ JNZ mulGFNI_7x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x9Xor(SB), $0-88
+ // Loading 5 of 63 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 74 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulAvxGFNI_7x9Xor_loop:
+ // Load 9 outputs
+ MOVQ (R11), R13
+ VMOVDQU (R13)(R12*1), Y5
+ MOVQ 24(R11), R13
+ VMOVDQU (R13)(R12*1), Y6
+ MOVQ 48(R11), R13
+ VMOVDQU (R13)(R12*1), Y7
+ MOVQ 72(R11), R13
+ VMOVDQU (R13)(R12*1), Y8
+ MOVQ 96(R11), R13
+ VMOVDQU (R13)(R12*1), Y9
+ MOVQ 120(R11), R13
+ VMOVDQU (R13)(R12*1), Y10
+ MOVQ 144(R11), R13
+ VMOVDQU (R13)(R12*1), Y11
+ MOVQ 168(R11), R13
+ VMOVDQU (R13)(R12*1), Y12
+ MOVQ 192(R11), R13
+ VMOVDQU (R13)(R12*1), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y5, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y10, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y11, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y12, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU Y13, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxGFNI_7x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x9Xor_end:
+ RET
+
+// func mulAvxTwo_7x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x9Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 140 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x9Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_7x9Xor_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ MOVQ (R11), R13
+ VMOVDQU (R13)(R12*1), Y0
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ MOVQ 24(R11), R13
+ VMOVDQU (R13)(R12*1), Y1
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ MOVQ 48(R11), R13
+ VMOVDQU (R13)(R12*1), Y2
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ MOVQ 72(R11), R13
+ VMOVDQU (R13)(R12*1), Y3
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ MOVQ 96(R11), R13
+ VMOVDQU (R13)(R12*1), Y4
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ MOVQ 120(R11), R13
+ VMOVDQU (R13)(R12*1), Y5
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ MOVQ 144(R11), R13
+ VMOVDQU (R13)(R12*1), Y6
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ MOVQ 168(R11), R13
+ VMOVDQU (R13)(R12*1), Y7
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ MOVQ 192(R11), R13
+ VMOVDQU (R13)(R12*1), Y8
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y12
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1152(CX), Y10
+ VMOVDQU 1184(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1216(CX), Y10
+ VMOVDQU 1248(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1280(CX), Y10
+ VMOVDQU 1312(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1344(CX), Y10
+ VMOVDQU 1376(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1408(CX), Y10
+ VMOVDQU 1440(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 1472(CX), Y10
+ VMOVDQU 1504(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 1536(CX), Y10
+ VMOVDQU 1568(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1600(CX), Y10
+ VMOVDQU 1632(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1664(CX), Y10
+ VMOVDQU 1696(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y12
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1728(CX), Y10
+ VMOVDQU 1760(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1792(CX), Y10
+ VMOVDQU 1824(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1856(CX), Y10
+ VMOVDQU 1888(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1920(CX), Y10
+ VMOVDQU 1952(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1984(CX), Y10
+ VMOVDQU 2016(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2048(CX), Y10
+ VMOVDQU 2080(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2112(CX), Y10
+ VMOVDQU 2144(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2176(CX), Y10
+ VMOVDQU 2208(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2240(CX), Y10
+ VMOVDQU 2272(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y12
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2304(CX), Y10
+ VMOVDQU 2336(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2368(CX), Y10
+ VMOVDQU 2400(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 2432(CX), Y10
+ VMOVDQU 2464(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 2496(CX), Y10
+ VMOVDQU 2528(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 2560(CX), Y10
+ VMOVDQU 2592(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2624(CX), Y10
+ VMOVDQU 2656(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2688(CX), Y10
+ VMOVDQU 2720(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2752(CX), Y10
+ VMOVDQU 2784(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2816(CX), Y10
+ VMOVDQU 2848(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y12
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2880(CX), Y10
+ VMOVDQU 2912(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2944(CX), Y10
+ VMOVDQU 2976(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3008(CX), Y10
+ VMOVDQU 3040(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3072(CX), Y10
+ VMOVDQU 3104(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3136(CX), Y10
+ VMOVDQU 3168(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3200(CX), Y10
+ VMOVDQU 3232(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3264(CX), Y10
+ VMOVDQU 3296(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3328(CX), Y10
+ VMOVDQU 3360(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3392(CX), Y10
+ VMOVDQU 3424(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 3456(CX), Y10
+ VMOVDQU 3488(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 3520(CX), Y10
+ VMOVDQU 3552(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3584(CX), Y10
+ VMOVDQU 3616(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3648(CX), Y10
+ VMOVDQU 3680(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3712(CX), Y10
+ VMOVDQU 3744(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3776(CX), Y10
+ VMOVDQU 3808(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3840(CX), Y10
+ VMOVDQU 3872(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3904(CX), Y10
+ VMOVDQU 3936(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3968(CX), Y10
+ VMOVDQU 4000(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y0, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y1, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y2, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y3, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y4, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y5, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxTwo_7x9Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_7x9Xor_end:
+ RET
+
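+// Note on the mulAvxTwo kernels: they use the four-bit (nibble) lookup approach
+// to GF(2^8) multiplication. Each 32-byte block of an input shard is split into
+// low nibbles (VPAND with the broadcast 0x0f mask) and high nibbles (VPSRLQ $4
+// then VPAND), each half indexes a per-coefficient 16-entry table via VPSHUFB,
+// and the two lookups are XORed into the output accumulators, i.e. for every
+// byte b the product is roughly Tlo[b & 0x0f] ^ Thi[b >> 4]. Each matrix
+// coefficient therefore occupies 64 bytes of table data (two 32-byte tables).
+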
+// func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x10(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 155 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x10_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_7x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y0
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y1
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y2
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y3
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y4
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y5
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y6
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y7
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y8
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y9
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y13
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y13
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1280(CX), Y11
+ VMOVDQU 1312(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1344(CX), Y11
+ VMOVDQU 1376(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 1408(CX), Y11
+ VMOVDQU 1440(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 1472(CX), Y11
+ VMOVDQU 1504(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 1536(CX), Y11
+ VMOVDQU 1568(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 1600(CX), Y11
+ VMOVDQU 1632(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1664(CX), Y11
+ VMOVDQU 1696(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1728(CX), Y11
+ VMOVDQU 1760(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1792(CX), Y11
+ VMOVDQU 1824(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1856(CX), Y11
+ VMOVDQU 1888(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y13
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1920(CX), Y11
+ VMOVDQU 1952(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1984(CX), Y11
+ VMOVDQU 2016(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2048(CX), Y11
+ VMOVDQU 2080(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2112(CX), Y11
+ VMOVDQU 2144(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2176(CX), Y11
+ VMOVDQU 2208(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2240(CX), Y11
+ VMOVDQU 2272(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2304(CX), Y11
+ VMOVDQU 2336(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 2368(CX), Y11
+ VMOVDQU 2400(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 2432(CX), Y11
+ VMOVDQU 2464(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 2496(CX), Y11
+ VMOVDQU 2528(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y13
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 2560(CX), Y11
+ VMOVDQU 2592(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 2624(CX), Y11
+ VMOVDQU 2656(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2688(CX), Y11
+ VMOVDQU 2720(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2752(CX), Y11
+ VMOVDQU 2784(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2816(CX), Y11
+ VMOVDQU 2848(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2880(CX), Y11
+ VMOVDQU 2912(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2944(CX), Y11
+ VMOVDQU 2976(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3008(CX), Y11
+ VMOVDQU 3040(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3072(CX), Y11
+ VMOVDQU 3104(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3136(CX), Y11
+ VMOVDQU 3168(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y13
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3200(CX), Y11
+ VMOVDQU 3232(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3264(CX), Y11
+ VMOVDQU 3296(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3328(CX), Y11
+ VMOVDQU 3360(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 3392(CX), Y11
+ VMOVDQU 3424(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 3456(CX), Y11
+ VMOVDQU 3488(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 3520(CX), Y11
+ VMOVDQU 3552(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 3584(CX), Y11
+ VMOVDQU 3616(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3648(CX), Y11
+ VMOVDQU 3680(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3712(CX), Y11
+ VMOVDQU 3744(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3776(CX), Y11
+ VMOVDQU 3808(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3840(CX), Y11
+ VMOVDQU 3872(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3904(CX), Y11
+ VMOVDQU 3936(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3968(CX), Y11
+ VMOVDQU 4000(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 4032(CX), Y11
+ VMOVDQU 4064(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 4096(CX), Y11
+ VMOVDQU 4128(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 4160(CX), Y11
+ VMOVDQU 4192(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 4224(CX), Y11
+ VMOVDQU 4256(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 4288(CX), Y11
+ VMOVDQU 4320(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 4352(CX), Y11
+ VMOVDQU 4384(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 4416(CX), Y11
+ VMOVDQU 4448(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y0, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y1, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y2, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y3, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y4, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y5, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 216(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxTwo_7x10_loop
+ VZEROUPPER
+
+mulAvxTwo_7x10_end:
+ RET
+
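+// Note on the mulGFNI / mulAvxGFNI kernels: instead of table lookups they rely
+// on VGF2P8AFFINEQB. Each matrix coefficient is stored as an 8-byte GF(2) bit
+// matrix (hence matrix []uint64), broadcast into a vector register, and one
+// affine instruction per coefficient multiplies every byte of the input block
+// by that constant in GF(2^8). The 512-bit variants consume 64 bytes of each
+// input per iteration; the 256-bit AVX variants consume 32 bytes and broadcast
+// coefficients from memory when they do not all fit in registers.
+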
+// func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x10_64(SB), $0-88
+ // Loading 20 of 70 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulGFNI_7x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 Z20, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU64 Z21, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU64 Z22, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU64 Z23, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU64 Z24, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU64 Z25, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU64 Z26, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU64 Z27, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU64 Z28, (R13)(R12*1)
+ MOVQ 216(R11), R13
+ VMOVDQU64 Z29, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R12
+ DECQ AX
+ JNZ mulGFNI_7x10_64_loop
+ VZEROUPPER
+
+mulGFNI_7x10_64_end:
+ RET
+
+// func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x10(SB), $0-88
+ // Loading 4 of 70 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulAvxGFNI_7x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y4, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y5, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y10, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y11, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU Y12, (R13)(R12*1)
+ MOVQ 216(R11), R13
+ VMOVDQU Y13, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxGFNI_7x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x10_end:
+ RET
+
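+// Note on the ...Xor variants: they differ from the plain kernels only in how
+// the accumulators are seeded. Rather than overwriting the destination shards,
+// they first load the current output bytes ("Load 10 outputs" below) and XOR
+// the matrix products into them, so results can be accumulated into existing
+// parity data over several passes.
+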
+// func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x10_64Xor(SB), $0-88
+ // Loading 20 of 70 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulGFNI_7x10_64Xor_loop:
+ // Load 10 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 (R13)(R12*1), Z20
+ MOVQ 24(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z21
+ MOVQ 48(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z22
+ MOVQ 72(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z23
+ MOVQ 96(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z24
+ MOVQ 120(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z25
+ MOVQ 144(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z26
+ MOVQ 168(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z27
+ MOVQ 192(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z28
+ MOVQ 216(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 Z20, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU64 Z21, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU64 Z22, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU64 Z23, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU64 Z24, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU64 Z25, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU64 Z26, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU64 Z27, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU64 Z28, (R13)(R12*1)
+ MOVQ 216(R11), R13
+ VMOVDQU64 Z29, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R12
+ DECQ AX
+ JNZ mulGFNI_7x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x10Xor(SB), $0-88
+ // Loading 4 of 70 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulAvxGFNI_7x10Xor_loop:
+ // Load 10 outputs
+ MOVQ (R11), R13
+ VMOVDQU (R13)(R12*1), Y4
+ MOVQ 24(R11), R13
+ VMOVDQU (R13)(R12*1), Y5
+ MOVQ 48(R11), R13
+ VMOVDQU (R13)(R12*1), Y6
+ MOVQ 72(R11), R13
+ VMOVDQU (R13)(R12*1), Y7
+ MOVQ 96(R11), R13
+ VMOVDQU (R13)(R12*1), Y8
+ MOVQ 120(R11), R13
+ VMOVDQU (R13)(R12*1), Y9
+ MOVQ 144(R11), R13
+ VMOVDQU (R13)(R12*1), Y10
+ MOVQ 168(R11), R13
+ VMOVDQU (R13)(R12*1), Y11
+ MOVQ 192(R11), R13
+ VMOVDQU (R13)(R12*1), Y12
+ MOVQ 216(R11), R13
+ VMOVDQU (R13)(R12*1), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y4, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y5, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y10, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y11, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU Y12, (R13)(R12*1)
+ MOVQ 216(R11), R13
+ VMOVDQU Y13, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxGFNI_7x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x10Xor_end:
+ RET
+
+// func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_7x10Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 155 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_7x10Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_7x10Xor_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ MOVQ (R11), R13
+ VMOVDQU (R13)(R12*1), Y0
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ MOVQ 24(R11), R13
+ VMOVDQU (R13)(R12*1), Y1
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ MOVQ 48(R11), R13
+ VMOVDQU (R13)(R12*1), Y2
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ MOVQ 72(R11), R13
+ VMOVDQU (R13)(R12*1), Y3
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ MOVQ 96(R11), R13
+ VMOVDQU (R13)(R12*1), Y4
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ MOVQ 120(R11), R13
+ VMOVDQU (R13)(R12*1), Y5
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ MOVQ 144(R11), R13
+ VMOVDQU (R13)(R12*1), Y6
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ MOVQ 168(R11), R13
+ VMOVDQU (R13)(R12*1), Y7
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ MOVQ 192(R11), R13
+ VMOVDQU (R13)(R12*1), Y8
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ MOVQ 216(R11), R13
+ VMOVDQU (R13)(R12*1), Y9
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y13
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y13
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1280(CX), Y11
+ VMOVDQU 1312(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1344(CX), Y11
+ VMOVDQU 1376(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 1408(CX), Y11
+ VMOVDQU 1440(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 1472(CX), Y11
+ VMOVDQU 1504(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 1536(CX), Y11
+ VMOVDQU 1568(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 1600(CX), Y11
+ VMOVDQU 1632(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1664(CX), Y11
+ VMOVDQU 1696(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1728(CX), Y11
+ VMOVDQU 1760(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1792(CX), Y11
+ VMOVDQU 1824(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1856(CX), Y11
+ VMOVDQU 1888(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y13
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1920(CX), Y11
+ VMOVDQU 1952(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1984(CX), Y11
+ VMOVDQU 2016(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2048(CX), Y11
+ VMOVDQU 2080(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2112(CX), Y11
+ VMOVDQU 2144(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2176(CX), Y11
+ VMOVDQU 2208(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2240(CX), Y11
+ VMOVDQU 2272(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2304(CX), Y11
+ VMOVDQU 2336(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 2368(CX), Y11
+ VMOVDQU 2400(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 2432(CX), Y11
+ VMOVDQU 2464(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 2496(CX), Y11
+ VMOVDQU 2528(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y13
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 2560(CX), Y11
+ VMOVDQU 2592(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 2624(CX), Y11
+ VMOVDQU 2656(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2688(CX), Y11
+ VMOVDQU 2720(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2752(CX), Y11
+ VMOVDQU 2784(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2816(CX), Y11
+ VMOVDQU 2848(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2880(CX), Y11
+ VMOVDQU 2912(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2944(CX), Y11
+ VMOVDQU 2976(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3008(CX), Y11
+ VMOVDQU 3040(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3072(CX), Y11
+ VMOVDQU 3104(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3136(CX), Y11
+ VMOVDQU 3168(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y13
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3200(CX), Y11
+ VMOVDQU 3232(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3264(CX), Y11
+ VMOVDQU 3296(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3328(CX), Y11
+ VMOVDQU 3360(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 3392(CX), Y11
+ VMOVDQU 3424(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 3456(CX), Y11
+ VMOVDQU 3488(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 3520(CX), Y11
+ VMOVDQU 3552(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 3584(CX), Y11
+ VMOVDQU 3616(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3648(CX), Y11
+ VMOVDQU 3680(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3712(CX), Y11
+ VMOVDQU 3744(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3776(CX), Y11
+ VMOVDQU 3808(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3840(CX), Y11
+ VMOVDQU 3872(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3904(CX), Y11
+ VMOVDQU 3936(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3968(CX), Y11
+ VMOVDQU 4000(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 4032(CX), Y11
+ VMOVDQU 4064(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 4096(CX), Y11
+ VMOVDQU 4128(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 4160(CX), Y11
+ VMOVDQU 4192(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 4224(CX), Y11
+ VMOVDQU 4256(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 4288(CX), Y11
+ VMOVDQU 4320(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 4352(CX), Y11
+ VMOVDQU 4384(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 4416(CX), Y11
+ VMOVDQU 4448(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y0, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y1, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y2, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y3, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y4, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y5, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 216(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxTwo_7x10Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_7x10Xor_end:
+ RET
+
+// func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x1_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x1_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X2
+ VPBROADCASTB X2, Y2
+
+mulAvxTwo_8x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU 32(BX), Y5
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ VPXOR Y3, Y4, Y0
+ VPXOR Y5, Y6, Y1
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU 32(SI), Y5
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU 32(DI), Y5
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU (R8), Y6
+ VMOVDQU 32(R8), Y5
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU (R9), Y6
+ VMOVDQU 32(R9), Y5
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 256(CX), Y3
+ VMOVDQU 288(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU (R10), Y6
+ VMOVDQU 32(R10), Y5
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 320(CX), Y3
+ VMOVDQU 352(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU (R11), Y6
+ VMOVDQU 32(R11), Y5
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 384(CX), Y3
+ VMOVDQU 416(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU (DX), Y6
+ VMOVDQU 32(DX), Y5
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 448(CX), Y3
+ VMOVDQU 480(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Store 1 outputs
+ VMOVDQU Y0, (R12)
+ VMOVDQU Y1, 32(R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_8x1_64_loop
+ VZEROUPPER
+
+mulAvxTwo_8x1_64_end:
+ RET
+
+// func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 11 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), CX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R11
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, CX
+
+mulGFNI_8x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z9
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z9, Z8
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z9
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z9
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z9
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z9
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (R9), Z9
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z5, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU64 (R10), Z9
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z6, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU64 (CX), Z9
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z7, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Store 1 outputs
+ VMOVDQU64 Z8, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_8x1_64_loop
+ VZEROUPPER
+
+mulGFNI_8x1_64_end:
+ RET
+
+// func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 11 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), CX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R11
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, CX
+
+mulAvxGFNI_8x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y9, Y8
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y9
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (R9), Y9
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y5, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 6 to 1 outputs
+ VMOVDQU (R10), Y9
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y6, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 7 to 1 outputs
+ VMOVDQU (CX), Y9
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y7, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Store 1 outputs
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_8x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x1_end:
+ RET
+
+// func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 11 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), CX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R11
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, CX
+
+mulGFNI_8x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (R11), Z8
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z9
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z9
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z9
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z9
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z9
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (R9), Z9
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z5, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU64 (R10), Z9
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z6, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU64 (CX), Z9
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z7, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Store 1 outputs
+ VMOVDQU64 Z8, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_8x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 11 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), CX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R11
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, CX
+
+mulAvxGFNI_8x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R11), Y8
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y9
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (R9), Y9
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y5, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 6 to 1 outputs
+ VMOVDQU (R10), Y9
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y6, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 7 to 1 outputs
+ VMOVDQU (CX), Y9
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y7, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Store 1 outputs
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_8x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x1Xor_end:
+ RET
+
+// func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x1_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x1_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R13
+ MOVQ R13, X2
+ VPBROADCASTB X2, Y2
+
+mulAvxTwo_8x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R12), Y0
+ VMOVDQU 32(R12), Y1
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU 32(BX), Y5
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU 32(SI), Y5
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU 32(DI), Y5
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU (R8), Y6
+ VMOVDQU 32(R8), Y5
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU (R9), Y6
+ VMOVDQU 32(R9), Y5
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 256(CX), Y3
+ VMOVDQU 288(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU (R10), Y6
+ VMOVDQU 32(R10), Y5
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 320(CX), Y3
+ VMOVDQU 352(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU (R11), Y6
+ VMOVDQU 32(R11), Y5
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 384(CX), Y3
+ VMOVDQU 416(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU (DX), Y6
+ VMOVDQU 32(DX), Y5
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 448(CX), Y3
+ VMOVDQU 480(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Store 1 outputs
+ VMOVDQU Y0, (R12)
+ VMOVDQU Y1, 32(R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_8x1_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_8x1_64Xor_end:
+ RET
+
+// func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x2_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 73 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x2_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R12
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+ ADDQ R14, R12
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_8x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VPXOR Y7, Y8, Y3
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y9
+ VMOVDQU 32(SI), Y11
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU 32(DI), Y11
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y9
+ VMOVDQU 32(R8), Y11
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y9
+ VMOVDQU 32(R9), Y11
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y9
+ VMOVDQU 32(R10), Y11
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y9
+ VMOVDQU 32(R11), Y11
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (R13)
+ VMOVDQU Y1, 32(R13)
+ ADDQ $0x40, R13
+ VMOVDQU Y2, (R12)
+ VMOVDQU Y3, 32(R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_8x2_64_loop
+ VZEROUPPER
+
+mulAvxTwo_8x2_64_end:
+ RET
+
+// func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), CX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R11
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+ ADDQ R13, R11
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, CX
+
+mulGFNI_8x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z18
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z18, Z16
+ VGF2P8AFFINEQB $0x00, Z1, Z18, Z17
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z18
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z18
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z18
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z18
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (R9), Z18
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU64 (R10), Z18
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU64 (CX), Z18
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z15, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Store 2 outputs
+ VMOVDQU64 Z16, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z17, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_8x2_64_loop
+ VZEROUPPER
+
+mulGFNI_8x2_64_end:
+ RET
+
+// func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x2(SB), $0-88
+ // Loading 12 of 16 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R12
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+ ADDQ R14, R12
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, DX
+
+mulAvxGFNI_8x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_8x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x2_end:
+ RET
+
+// func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), CX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R11
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+ ADDQ R13, R11
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, CX
+
+mulGFNI_8x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (R12), Z16
+ VMOVDQU64 (R11), Z17
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z18
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z1, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z18
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z18
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z18
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z18
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (R9), Z18
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU64 (R10), Z18
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU64 (CX), Z18
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z15, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Store 2 outputs
+ VMOVDQU64 Z16, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z17, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_8x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x2Xor(SB), $0-88
+ // Loading 12 of 16 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R12
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+ ADDQ R14, R12
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, DX
+
+mulAvxGFNI_8x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R13), Y12
+ VMOVDQU (R12), Y13
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_8x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x2Xor_end:
+ RET
+
+// func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x2_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 73 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x2_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R12
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+ ADDQ R14, R12
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_8x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R13), Y0
+ VMOVDQU 32(R13), Y1
+ VMOVDQU (R12), Y2
+ VMOVDQU 32(R12), Y3
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y9
+ VMOVDQU 32(SI), Y11
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU 32(DI), Y11
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y9
+ VMOVDQU 32(R8), Y11
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y9
+ VMOVDQU 32(R9), Y11
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y9
+ VMOVDQU 32(R10), Y11
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y9
+ VMOVDQU 32(R11), Y11
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (R13)
+ VMOVDQU Y1, 32(R13)
+ ADDQ $0x40, R13
+ VMOVDQU Y2, (R12)
+ VMOVDQU Y3, 32(R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_8x2_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_8x2_64Xor_end:
+ RET
+
+// func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x3_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 106 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x3_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R12
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R12
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_8x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VPXOR Y9, Y10, Y5
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y11
+ VMOVDQU 32(SI), Y13
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y11
+ VMOVDQU 32(DI), Y13
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y11
+ VMOVDQU 32(R8), Y13
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y11
+ VMOVDQU 32(R9), Y13
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y11
+ VMOVDQU 32(R10), Y13
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU (R11), Y11
+ VMOVDQU 32(R11), Y13
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (R13)
+ VMOVDQU Y1, 32(R13)
+ ADDQ $0x40, R13
+ VMOVDQU Y2, (R14)
+ VMOVDQU Y3, 32(R14)
+ ADDQ $0x40, R14
+ VMOVDQU Y4, (R12)
+ VMOVDQU Y5, 32(R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_8x3_64_loop
+ VZEROUPPER
+
+mulAvxTwo_8x3_64_end:
+ RET
+
+// func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), CX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R11
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R11
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, CX
+
+mulGFNI_8x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z27
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z27, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z27, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z27, Z26
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z27
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z27
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z27
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z10, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z27
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (R9), Z27
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z15, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU64 (R10), Z27
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z18, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU64 (CX), Z27
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z21, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z22, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z23, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Store 3 outputs
+ VMOVDQU64 Z24, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z25, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z26, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_8x3_64_loop
+ VZEROUPPER
+
+mulGFNI_8x3_64_end:
+ RET
+
+// func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x3(SB), $0-88
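+	// Only 16 YMM registers are available, so 11 of the 24 matrices stay
+	// resident in Y0-Y10 and the rest are re-broadcast from the matrix slice
+	// with VBROADCASTSD inside the loop; three registers hold the outputs,
+	// one holds the current input block and one is scratch. This AVX variant
+	// processes 32 bytes per input per iteration (n shifted right by 5).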
+ // Loading 11 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R12
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R12
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, DX
+
+mulAvxGFNI_8x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 3 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_8x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x3_end:
+ RET
+
+// func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x3_64Xor(SB), $0-88
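+	// Xor variant: the three destination vectors are reloaded at the top of
+	// every iteration and the per-input products are XORed into them, so the
+	// result is accumulated into the existing output instead of replacing it.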
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), CX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R11
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R11
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, CX
+
+mulGFNI_8x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (R12), Z24
+ VMOVDQU64 (R13), Z25
+ VMOVDQU64 (R11), Z26
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z27
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z27
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z27
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z27
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z10, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z27
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (R9), Z27
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z15, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU64 (R10), Z27
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z18, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU64 (CX), Z27
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z21, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z22, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z23, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Store 3 outputs
+ VMOVDQU64 Z24, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z25, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z26, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_8x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x3Xor(SB), $0-88
+ // Loading 11 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R12
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R12
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, DX
+
+mulAvxGFNI_8x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R13), Y11
+ VMOVDQU (R14), Y12
+ VMOVDQU (R12), Y13
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 3 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_8x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x3Xor_end:
+ RET
+
+// func mulAvxTwo_8x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x3_64Xor(SB), $0-88
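+	// AVX2 lookup kernel: each input byte is split into its low and high
+	// nibble (VPAND with the broadcast 0x0f mask, VPSRLQ by 4 plus the same
+	// mask), the low nibble indexes the first 32-byte VPSHUFB table of each
+	// table pair and the high nibble the second, and the two partial products
+	// are folded into the output accumulators with the XOR3WAY macro. The
+	// "_64" form handles two 32-byte vectors (64 bytes) per input each
+	// iteration, hence n is shifted right by 6.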
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 106 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x3_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R12
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R12
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_8x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R13), Y0
+ VMOVDQU 32(R13), Y1
+ VMOVDQU (R14), Y2
+ VMOVDQU 32(R14), Y3
+ VMOVDQU (R12), Y4
+ VMOVDQU 32(R12), Y5
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y11
+ VMOVDQU 32(SI), Y13
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y11
+ VMOVDQU 32(DI), Y13
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y11
+ VMOVDQU 32(R8), Y13
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y11
+ VMOVDQU 32(R9), Y13
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y11
+ VMOVDQU 32(R10), Y13
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU (R11), Y11
+ VMOVDQU 32(R11), Y13
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (R13)
+ VMOVDQU Y1, 32(R13)
+ ADDQ $0x40, R13
+ VMOVDQU Y2, (R14)
+ VMOVDQU Y3, 32(R14)
+ ADDQ $0x40, R14
+ VMOVDQU Y4, (R12)
+ VMOVDQU Y5, 32(R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_8x3_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_8x3_64Xor_end:
+ RET
+
+// func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x4(SB), NOSPLIT, $8-88
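+	// Non-Xor variant: the table lookups for input 0 initialize the four
+	// output accumulators directly with VPXOR; the remaining seven inputs are
+	// accumulated with XOR3WAY. 32 bytes are processed per input per
+	// iteration (n shifted right by 5).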
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 73 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x4_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_8x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y1
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y3
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y7
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y7
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1024(CX), Y5
+ VMOVDQU 1056(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1088(CX), Y5
+ VMOVDQU 1120(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1152(CX), Y5
+ VMOVDQU 1184(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1216(CX), Y5
+ VMOVDQU 1248(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y7
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1280(CX), Y5
+ VMOVDQU 1312(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1344(CX), Y5
+ VMOVDQU 1376(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1408(CX), Y5
+ VMOVDQU 1440(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1472(CX), Y5
+ VMOVDQU 1504(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R11), Y7
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1536(CX), Y5
+ VMOVDQU 1568(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1600(CX), Y5
+ VMOVDQU 1632(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1664(CX), Y5
+ VMOVDQU 1696(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1728(CX), Y5
+ VMOVDQU 1760(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1792(CX), Y5
+ VMOVDQU 1824(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1856(CX), Y5
+ VMOVDQU 1888(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1920(CX), Y5
+ VMOVDQU 1952(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1984(CX), Y5
+ VMOVDQU 2016(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ VMOVDQU Y0, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y1, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y2, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y3, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_8x4_loop
+ VZEROUPPER
+
+mulAvxTwo_8x4_end:
+ RET
+
+// func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x4_64(SB), $8-88
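+	// 26 of the 32 matrices fit in Z0-Z25 next to the four output registers,
+	// one input register and one scratch register; the remaining six are
+	// applied with VGF2P8AFFINEQB.BCST, which broadcasts the 8-byte matrix
+	// directly from the matrix slice as a memory operand instead of holding
+	// it in a register.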
+ // Loading 26 of 32 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, DX
+
+mulGFNI_8x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 4 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 4 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 4 outputs
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_8x4_64_loop
+ VZEROUPPER
+
+mulGFNI_8x4_64_end:
+ RET
+
+// func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x4(SB), $8-88
+ // Loading 10 of 32 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, DX
+
+mulAvxGFNI_8x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_8x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x4_end:
+ RET
+
+// func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x4_64Xor(SB), $8-88
+ // Loading 26 of 32 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, DX
+
+mulGFNI_8x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R12), Z29
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 4 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 4 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 4 outputs
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_8x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x4Xor(SB), $8-88
+ // Loading 10 of 32 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, DX
+
+mulAvxGFNI_8x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R12), Y13
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_8x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x4Xor_end:
+ RET
+
+// func mulAvxTwo_8x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x4Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 73 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x4Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_8x4Xor_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (R13), Y0
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU (R14), Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU (R15), Y2
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU (R12), Y3
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y7
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y7
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1024(CX), Y5
+ VMOVDQU 1056(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1088(CX), Y5
+ VMOVDQU 1120(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1152(CX), Y5
+ VMOVDQU 1184(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1216(CX), Y5
+ VMOVDQU 1248(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y7
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1280(CX), Y5
+ VMOVDQU 1312(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1344(CX), Y5
+ VMOVDQU 1376(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1408(CX), Y5
+ VMOVDQU 1440(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1472(CX), Y5
+ VMOVDQU 1504(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R11), Y7
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1536(CX), Y5
+ VMOVDQU 1568(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1600(CX), Y5
+ VMOVDQU 1632(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1664(CX), Y5
+ VMOVDQU 1696(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1728(CX), Y5
+ VMOVDQU 1760(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1792(CX), Y5
+ VMOVDQU 1824(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1856(CX), Y5
+ VMOVDQU 1888(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1920(CX), Y5
+ VMOVDQU 1952(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1984(CX), Y5
+ VMOVDQU 2016(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ VMOVDQU Y0, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y1, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y2, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y3, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_8x4Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_8x4Xor_end:
+ RET
+
+// func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x5(SB), NOSPLIT, $8-88
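+	// Eight inputs and five outputs use up the general-purpose registers, so
+	// AX is reused as the pointer to the last input shard and the loop count
+	// (n >> 5) is recomputed into BP once the 0x0f nibble mask has been
+	// broadcast.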
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 90 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x5_end
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), AX
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, AX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X5
+ VPBROADCASTB X5, Y5
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxTwo_8x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y0
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y1
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y2
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y3
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y4
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 640(CX), Y6
+ VMOVDQU 672(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 704(CX), Y6
+ VMOVDQU 736(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 768(CX), Y6
+ VMOVDQU 800(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 832(CX), Y6
+ VMOVDQU 864(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 896(CX), Y6
+ VMOVDQU 928(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 960(CX), Y6
+ VMOVDQU 992(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1024(CX), Y6
+ VMOVDQU 1056(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1088(CX), Y6
+ VMOVDQU 1120(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1152(CX), Y6
+ VMOVDQU 1184(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1216(CX), Y6
+ VMOVDQU 1248(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R8), Y8
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1280(CX), Y6
+ VMOVDQU 1312(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1344(CX), Y6
+ VMOVDQU 1376(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1408(CX), Y6
+ VMOVDQU 1440(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1472(CX), Y6
+ VMOVDQU 1504(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1536(CX), Y6
+ VMOVDQU 1568(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R9), Y8
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1600(CX), Y6
+ VMOVDQU 1632(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1664(CX), Y6
+ VMOVDQU 1696(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1728(CX), Y6
+ VMOVDQU 1760(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1792(CX), Y6
+ VMOVDQU 1824(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1856(CX), Y6
+ VMOVDQU 1888(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R10), Y8
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1920(CX), Y6
+ VMOVDQU 1952(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1984(CX), Y6
+ VMOVDQU 2016(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2048(CX), Y6
+ VMOVDQU 2080(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2112(CX), Y6
+ VMOVDQU 2144(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2176(CX), Y6
+ VMOVDQU 2208(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (AX), Y8
+ ADDQ $0x20, AX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 2240(CX), Y6
+ VMOVDQU 2272(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 2304(CX), Y6
+ VMOVDQU 2336(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2368(CX), Y6
+ VMOVDQU 2400(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2432(CX), Y6
+ VMOVDQU 2464(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2496(CX), Y6
+ VMOVDQU 2528(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ VMOVDQU Y0, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y1, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y2, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y3, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y4, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxTwo_8x5_loop
+ VZEROUPPER
+
+mulAvxTwo_8x5_end:
+ RET
+
+// func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x5_64(SB), $8-88
+ // Loading 25 of 40 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), AX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_8x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 5 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 5 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_8x5_64_loop
+ VZEROUPPER
+
+mulGFNI_8x5_64_end:
+ RET
+
+// func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x5(SB), $8-88
+ // Loading 9 of 40 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), AX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_8x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_8x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x5_end:
+ RET
+
+// func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x5_64Xor(SB), $8-88
+ // Loading 25 of 40 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), AX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_8x5_64Xor_loop:
+ // Load 5 outputs
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R11), Z29
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 5 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 5 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_8x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x5Xor(SB), $8-88
+ // Loading 9 of 40 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), AX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_8x5Xor_loop:
+ // Load 5 outputs
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R11), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_8x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x5Xor_end:
+ RET
+
+// func mulAvxTwo_8x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x5Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 90 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x5Xor_end
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), AX
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, AX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X5
+ VPBROADCASTB X5, Y5
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxTwo_8x5Xor_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU (R12), Y0
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU (R13), Y1
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU (R14), Y2
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU (R15), Y3
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU (R11), Y4
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 640(CX), Y6
+ VMOVDQU 672(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 704(CX), Y6
+ VMOVDQU 736(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 768(CX), Y6
+ VMOVDQU 800(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 832(CX), Y6
+ VMOVDQU 864(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 896(CX), Y6
+ VMOVDQU 928(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 960(CX), Y6
+ VMOVDQU 992(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1024(CX), Y6
+ VMOVDQU 1056(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1088(CX), Y6
+ VMOVDQU 1120(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1152(CX), Y6
+ VMOVDQU 1184(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1216(CX), Y6
+ VMOVDQU 1248(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R8), Y8
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1280(CX), Y6
+ VMOVDQU 1312(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1344(CX), Y6
+ VMOVDQU 1376(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1408(CX), Y6
+ VMOVDQU 1440(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1472(CX), Y6
+ VMOVDQU 1504(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1536(CX), Y6
+ VMOVDQU 1568(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R9), Y8
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1600(CX), Y6
+ VMOVDQU 1632(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1664(CX), Y6
+ VMOVDQU 1696(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1728(CX), Y6
+ VMOVDQU 1760(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1792(CX), Y6
+ VMOVDQU 1824(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1856(CX), Y6
+ VMOVDQU 1888(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R10), Y8
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1920(CX), Y6
+ VMOVDQU 1952(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1984(CX), Y6
+ VMOVDQU 2016(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2048(CX), Y6
+ VMOVDQU 2080(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2112(CX), Y6
+ VMOVDQU 2144(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2176(CX), Y6
+ VMOVDQU 2208(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (AX), Y8
+ ADDQ $0x20, AX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 2240(CX), Y6
+ VMOVDQU 2272(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 2304(CX), Y6
+ VMOVDQU 2336(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2368(CX), Y6
+ VMOVDQU 2400(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2432(CX), Y6
+ VMOVDQU 2464(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2496(CX), Y6
+ VMOVDQU 2528(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ VMOVDQU Y0, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y1, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y2, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y3, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y4, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxTwo_8x5Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_8x5Xor_end:
+ RET
+
+// func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x6(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 107 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x6_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_8x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y3
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y5
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y9
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y9
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1536(CX), Y7
+ VMOVDQU 1568(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1600(CX), Y7
+ VMOVDQU 1632(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1664(CX), Y7
+ VMOVDQU 1696(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1728(CX), Y7
+ VMOVDQU 1760(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1792(CX), Y7
+ VMOVDQU 1824(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1856(CX), Y7
+ VMOVDQU 1888(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y9
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1920(CX), Y7
+ VMOVDQU 1952(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1984(CX), Y7
+ VMOVDQU 2016(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2048(CX), Y7
+ VMOVDQU 2080(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2112(CX), Y7
+ VMOVDQU 2144(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2176(CX), Y7
+ VMOVDQU 2208(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2240(CX), Y7
+ VMOVDQU 2272(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y9
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 2304(CX), Y7
+ VMOVDQU 2336(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 2368(CX), Y7
+ VMOVDQU 2400(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2432(CX), Y7
+ VMOVDQU 2464(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2496(CX), Y7
+ VMOVDQU 2528(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2560(CX), Y7
+ VMOVDQU 2592(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2624(CX), Y7
+ VMOVDQU 2656(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 2688(CX), Y7
+ VMOVDQU 2720(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 2752(CX), Y7
+ VMOVDQU 2784(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2816(CX), Y7
+ VMOVDQU 2848(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2880(CX), Y7
+ VMOVDQU 2912(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2944(CX), Y7
+ VMOVDQU 2976(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 3008(CX), Y7
+ VMOVDQU 3040(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y0, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y1, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y2, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y3, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y4, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxTwo_8x6_loop
+ VZEROUPPER
+
+mulAvxTwo_8x6_end:
+ RET
+
+// func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x6_64(SB), $0-88
+ // Loading 24 of 48 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 56 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 6 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x6_64_loop
+ VZEROUPPER
+
+mulGFNI_8x6_64_end:
+ RET
+
+// func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x6(SB), $0-88
+ // Loading 8 of 48 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 56 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x6_end:
+ RET
+
+// func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x6_64Xor(SB), $0-88
+ // Loading 24 of 48 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 56 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x6_64Xor_loop:
+ // Load 6 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 (R14)(R13*1), Z24
+ MOVQ 24(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z25
+ MOVQ 48(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z26
+ MOVQ 72(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z27
+ MOVQ 96(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z28
+ MOVQ 120(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z29
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 6 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x6Xor(SB), $0-88
+ // Loading 8 of 48 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 56 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x6Xor_loop:
+ // Load 6 outputs
+ MOVQ (R12), R14
+ VMOVDQU (R14)(R13*1), Y8
+ MOVQ 24(R12), R14
+ VMOVDQU (R14)(R13*1), Y9
+ MOVQ 48(R12), R14
+ VMOVDQU (R14)(R13*1), Y10
+ MOVQ 72(R12), R14
+ VMOVDQU (R14)(R13*1), Y11
+ MOVQ 96(R12), R14
+ VMOVDQU (R14)(R13*1), Y12
+ MOVQ 120(R12), R14
+ VMOVDQU (R14)(R13*1), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x6Xor_end:
+ RET
+
+// func mulAvxTwo_8x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x6Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 107 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x6Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_8x6Xor_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ MOVQ (R12), R14
+ VMOVDQU (R14)(R13*1), Y0
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ MOVQ 24(R12), R14
+ VMOVDQU (R14)(R13*1), Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ MOVQ 48(R12), R14
+ VMOVDQU (R14)(R13*1), Y2
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ MOVQ 72(R12), R14
+ VMOVDQU (R14)(R13*1), Y3
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ MOVQ 96(R12), R14
+ VMOVDQU (R14)(R13*1), Y4
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ MOVQ 120(R12), R14
+ VMOVDQU (R14)(R13*1), Y5
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y9
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y9
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1536(CX), Y7
+ VMOVDQU 1568(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1600(CX), Y7
+ VMOVDQU 1632(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1664(CX), Y7
+ VMOVDQU 1696(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1728(CX), Y7
+ VMOVDQU 1760(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1792(CX), Y7
+ VMOVDQU 1824(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1856(CX), Y7
+ VMOVDQU 1888(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y9
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1920(CX), Y7
+ VMOVDQU 1952(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1984(CX), Y7
+ VMOVDQU 2016(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2048(CX), Y7
+ VMOVDQU 2080(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2112(CX), Y7
+ VMOVDQU 2144(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2176(CX), Y7
+ VMOVDQU 2208(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2240(CX), Y7
+ VMOVDQU 2272(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y9
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 2304(CX), Y7
+ VMOVDQU 2336(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 2368(CX), Y7
+ VMOVDQU 2400(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2432(CX), Y7
+ VMOVDQU 2464(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2496(CX), Y7
+ VMOVDQU 2528(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2560(CX), Y7
+ VMOVDQU 2592(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2624(CX), Y7
+ VMOVDQU 2656(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 2688(CX), Y7
+ VMOVDQU 2720(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 2752(CX), Y7
+ VMOVDQU 2784(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2816(CX), Y7
+ VMOVDQU 2848(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2880(CX), Y7
+ VMOVDQU 2912(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2944(CX), Y7
+ VMOVDQU 2976(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 3008(CX), Y7
+ VMOVDQU 3040(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y0, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y1, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y2, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y3, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y4, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxTwo_8x6Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_8x6Xor_end:
+ RET
+
+// func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x7(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 124 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x7_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_8x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y0
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y1
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y2
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y3
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y4
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y5
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y6
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 896(CX), Y8
+ VMOVDQU 928(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 960(CX), Y8
+ VMOVDQU 992(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1024(CX), Y8
+ VMOVDQU 1056(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1088(CX), Y8
+ VMOVDQU 1120(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1152(CX), Y8
+ VMOVDQU 1184(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1216(CX), Y8
+ VMOVDQU 1248(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1280(CX), Y8
+ VMOVDQU 1312(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y10
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1344(CX), Y8
+ VMOVDQU 1376(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1408(CX), Y8
+ VMOVDQU 1440(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1472(CX), Y8
+ VMOVDQU 1504(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1536(CX), Y8
+ VMOVDQU 1568(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1600(CX), Y8
+ VMOVDQU 1632(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1664(CX), Y8
+ VMOVDQU 1696(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1728(CX), Y8
+ VMOVDQU 1760(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y10
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1792(CX), Y8
+ VMOVDQU 1824(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1856(CX), Y8
+ VMOVDQU 1888(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1920(CX), Y8
+ VMOVDQU 1952(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1984(CX), Y8
+ VMOVDQU 2016(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2048(CX), Y8
+ VMOVDQU 2080(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2112(CX), Y8
+ VMOVDQU 2144(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2176(CX), Y8
+ VMOVDQU 2208(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y10
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2240(CX), Y8
+ VMOVDQU 2272(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2304(CX), Y8
+ VMOVDQU 2336(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2368(CX), Y8
+ VMOVDQU 2400(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2432(CX), Y8
+ VMOVDQU 2464(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2496(CX), Y8
+ VMOVDQU 2528(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2560(CX), Y8
+ VMOVDQU 2592(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2624(CX), Y8
+ VMOVDQU 2656(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y10
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2688(CX), Y8
+ VMOVDQU 2720(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2752(CX), Y8
+ VMOVDQU 2784(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2816(CX), Y8
+ VMOVDQU 2848(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2880(CX), Y8
+ VMOVDQU 2912(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2944(CX), Y8
+ VMOVDQU 2976(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3008(CX), Y8
+ VMOVDQU 3040(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3072(CX), Y8
+ VMOVDQU 3104(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 3136(CX), Y8
+ VMOVDQU 3168(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 3200(CX), Y8
+ VMOVDQU 3232(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 3264(CX), Y8
+ VMOVDQU 3296(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 3328(CX), Y8
+ VMOVDQU 3360(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 3392(CX), Y8
+ VMOVDQU 3424(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3456(CX), Y8
+ VMOVDQU 3488(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3520(CX), Y8
+ VMOVDQU 3552(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y0, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y1, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y2, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y3, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y4, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxTwo_8x7_loop
+ VZEROUPPER
+
+mulAvxTwo_8x7_end:
+ RET
+
+// func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x7_64(SB), $0-88
+ // Loading 23 of 56 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 65 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 7 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z23, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x7_64_loop
+ VZEROUPPER
+
+mulGFNI_8x7_64_end:
+ RET
+
+// func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x7(SB), $0-88
+ // Loading 7 of 56 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 65 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x7_end:
+ RET
+
+// func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x7_64Xor(SB), $0-88
+ // Loading 23 of 56 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 65 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x7_64Xor_loop:
+ // Load 7 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 (R14)(R13*1), Z23
+ MOVQ 24(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z24
+ MOVQ 48(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z25
+ MOVQ 72(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z26
+ MOVQ 96(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z27
+ MOVQ 120(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z28
+ MOVQ 144(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z29
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 7 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z23, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x7Xor(SB), $0-88
+ // Loading 7 of 56 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 65 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x7Xor_loop:
+ // Load 7 outputs
+ MOVQ (R12), R14
+ VMOVDQU (R14)(R13*1), Y7
+ MOVQ 24(R12), R14
+ VMOVDQU (R14)(R13*1), Y8
+ MOVQ 48(R12), R14
+ VMOVDQU (R14)(R13*1), Y9
+ MOVQ 72(R12), R14
+ VMOVDQU (R14)(R13*1), Y10
+ MOVQ 96(R12), R14
+ VMOVDQU (R14)(R13*1), Y11
+ MOVQ 120(R12), R14
+ VMOVDQU (R14)(R13*1), Y12
+ MOVQ 144(R12), R14
+ VMOVDQU (R14)(R13*1), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x7Xor_end:
+ RET
+
+// func mulAvxTwo_8x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x7Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 124 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x7Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_8x7Xor_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ MOVQ (R12), R14
+ VMOVDQU (R14)(R13*1), Y0
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ MOVQ 24(R12), R14
+ VMOVDQU (R14)(R13*1), Y1
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ MOVQ 48(R12), R14
+ VMOVDQU (R14)(R13*1), Y2
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ MOVQ 72(R12), R14
+ VMOVDQU (R14)(R13*1), Y3
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ MOVQ 96(R12), R14
+ VMOVDQU (R14)(R13*1), Y4
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ MOVQ 120(R12), R14
+ VMOVDQU (R14)(R13*1), Y5
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ MOVQ 144(R12), R14
+ VMOVDQU (R14)(R13*1), Y6
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 896(CX), Y8
+ VMOVDQU 928(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 960(CX), Y8
+ VMOVDQU 992(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1024(CX), Y8
+ VMOVDQU 1056(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1088(CX), Y8
+ VMOVDQU 1120(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1152(CX), Y8
+ VMOVDQU 1184(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1216(CX), Y8
+ VMOVDQU 1248(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1280(CX), Y8
+ VMOVDQU 1312(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y10
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1344(CX), Y8
+ VMOVDQU 1376(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1408(CX), Y8
+ VMOVDQU 1440(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1472(CX), Y8
+ VMOVDQU 1504(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1536(CX), Y8
+ VMOVDQU 1568(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1600(CX), Y8
+ VMOVDQU 1632(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1664(CX), Y8
+ VMOVDQU 1696(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1728(CX), Y8
+ VMOVDQU 1760(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y10
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1792(CX), Y8
+ VMOVDQU 1824(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1856(CX), Y8
+ VMOVDQU 1888(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1920(CX), Y8
+ VMOVDQU 1952(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1984(CX), Y8
+ VMOVDQU 2016(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2048(CX), Y8
+ VMOVDQU 2080(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2112(CX), Y8
+ VMOVDQU 2144(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2176(CX), Y8
+ VMOVDQU 2208(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y10
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2240(CX), Y8
+ VMOVDQU 2272(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2304(CX), Y8
+ VMOVDQU 2336(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2368(CX), Y8
+ VMOVDQU 2400(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2432(CX), Y8
+ VMOVDQU 2464(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2496(CX), Y8
+ VMOVDQU 2528(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2560(CX), Y8
+ VMOVDQU 2592(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2624(CX), Y8
+ VMOVDQU 2656(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y10
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2688(CX), Y8
+ VMOVDQU 2720(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2752(CX), Y8
+ VMOVDQU 2784(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2816(CX), Y8
+ VMOVDQU 2848(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2880(CX), Y8
+ VMOVDQU 2912(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2944(CX), Y8
+ VMOVDQU 2976(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3008(CX), Y8
+ VMOVDQU 3040(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3072(CX), Y8
+ VMOVDQU 3104(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 3136(CX), Y8
+ VMOVDQU 3168(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 3200(CX), Y8
+ VMOVDQU 3232(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 3264(CX), Y8
+ VMOVDQU 3296(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 3328(CX), Y8
+ VMOVDQU 3360(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 3392(CX), Y8
+ VMOVDQU 3424(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3456(CX), Y8
+ VMOVDQU 3488(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3520(CX), Y8
+ VMOVDQU 3552(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y0, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y1, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y2, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y3, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y4, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxTwo_8x7Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_8x7Xor_end:
+ RET
+
+// func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x8(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 141 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x8_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_8x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y0
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y2
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y4
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y5
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y6
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y7
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1024(CX), Y9
+ VMOVDQU 1056(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1088(CX), Y9
+ VMOVDQU 1120(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1152(CX), Y9
+ VMOVDQU 1184(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1216(CX), Y9
+ VMOVDQU 1248(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y9
+ VMOVDQU 1312(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1344(CX), Y9
+ VMOVDQU 1376(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1408(CX), Y9
+ VMOVDQU 1440(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1472(CX), Y9
+ VMOVDQU 1504(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y11
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1536(CX), Y9
+ VMOVDQU 1568(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1600(CX), Y9
+ VMOVDQU 1632(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1664(CX), Y9
+ VMOVDQU 1696(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1728(CX), Y9
+ VMOVDQU 1760(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1792(CX), Y9
+ VMOVDQU 1824(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1856(CX), Y9
+ VMOVDQU 1888(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1920(CX), Y9
+ VMOVDQU 1952(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1984(CX), Y9
+ VMOVDQU 2016(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y11
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2048(CX), Y9
+ VMOVDQU 2080(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2112(CX), Y9
+ VMOVDQU 2144(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2176(CX), Y9
+ VMOVDQU 2208(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2240(CX), Y9
+ VMOVDQU 2272(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2304(CX), Y9
+ VMOVDQU 2336(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2368(CX), Y9
+ VMOVDQU 2400(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2432(CX), Y9
+ VMOVDQU 2464(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 2496(CX), Y9
+ VMOVDQU 2528(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y11
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2560(CX), Y9
+ VMOVDQU 2592(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2624(CX), Y9
+ VMOVDQU 2656(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2688(CX), Y9
+ VMOVDQU 2720(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2752(CX), Y9
+ VMOVDQU 2784(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2816(CX), Y9
+ VMOVDQU 2848(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2880(CX), Y9
+ VMOVDQU 2912(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2944(CX), Y9
+ VMOVDQU 2976(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3008(CX), Y9
+ VMOVDQU 3040(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y11
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 3072(CX), Y9
+ VMOVDQU 3104(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 3136(CX), Y9
+ VMOVDQU 3168(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 3200(CX), Y9
+ VMOVDQU 3232(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 3264(CX), Y9
+ VMOVDQU 3296(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 3328(CX), Y9
+ VMOVDQU 3360(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 3392(CX), Y9
+ VMOVDQU 3424(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 3456(CX), Y9
+ VMOVDQU 3488(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3520(CX), Y9
+ VMOVDQU 3552(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 3584(CX), Y9
+ VMOVDQU 3616(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 3648(CX), Y9
+ VMOVDQU 3680(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 3712(CX), Y9
+ VMOVDQU 3744(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 3776(CX), Y9
+ VMOVDQU 3808(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 3840(CX), Y9
+ VMOVDQU 3872(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 3904(CX), Y9
+ VMOVDQU 3936(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 3968(CX), Y9
+ VMOVDQU 4000(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 4032(CX), Y9
+ VMOVDQU 4064(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y0, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y1, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y2, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y3, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y4, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxTwo_8x8_loop
+ VZEROUPPER
+
+mulAvxTwo_8x8_end:
+ RET
+
+// func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x8_64(SB), $0-88
+ // Loading 22 of 64 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 74 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 8 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z22, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z23, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x8_64_loop
+ VZEROUPPER
+
+mulGFNI_8x8_64_end:
+ RET
+
+// func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x8(SB), $0-88
+ // Loading 6 of 64 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 74 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x8_end:
+ RET
+
+// func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x8_64Xor(SB), $0-88
+ // Loading 22 of 64 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 74 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x8_64Xor_loop:
+ // Load 8 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 (R14)(R13*1), Z22
+ MOVQ 24(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z23
+ MOVQ 48(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z24
+ MOVQ 72(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z25
+ MOVQ 96(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z26
+ MOVQ 120(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z27
+ MOVQ 144(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z28
+ MOVQ 168(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z29
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 8 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z22, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z23, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x8Xor(SB), $0-88
+ // Loading 6 of 64 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 74 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x8Xor_loop:
+ // Load 8 outputs
+ MOVQ (R12), R14
+ VMOVDQU (R14)(R13*1), Y6
+ MOVQ 24(R12), R14
+ VMOVDQU (R14)(R13*1), Y7
+ MOVQ 48(R12), R14
+ VMOVDQU (R14)(R13*1), Y8
+ MOVQ 72(R12), R14
+ VMOVDQU (R14)(R13*1), Y9
+ MOVQ 96(R12), R14
+ VMOVDQU (R14)(R13*1), Y10
+ MOVQ 120(R12), R14
+ VMOVDQU (R14)(R13*1), Y11
+ MOVQ 144(R12), R14
+ VMOVDQU (R14)(R13*1), Y12
+ MOVQ 168(R12), R14
+ VMOVDQU (R14)(R13*1), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x8Xor_end:
+ RET
+
+// func mulAvxTwo_8x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x8Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 141 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x8Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_8x8Xor_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ MOVQ (R12), R14
+ VMOVDQU (R14)(R13*1), Y0
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ MOVQ 24(R12), R14
+ VMOVDQU (R14)(R13*1), Y1
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ MOVQ 48(R12), R14
+ VMOVDQU (R14)(R13*1), Y2
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ MOVQ 72(R12), R14
+ VMOVDQU (R14)(R13*1), Y3
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ MOVQ 96(R12), R14
+ VMOVDQU (R14)(R13*1), Y4
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ MOVQ 120(R12), R14
+ VMOVDQU (R14)(R13*1), Y5
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ MOVQ 144(R12), R14
+ VMOVDQU (R14)(R13*1), Y6
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ MOVQ 168(R12), R14
+ VMOVDQU (R14)(R13*1), Y7
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1024(CX), Y9
+ VMOVDQU 1056(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1088(CX), Y9
+ VMOVDQU 1120(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1152(CX), Y9
+ VMOVDQU 1184(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1216(CX), Y9
+ VMOVDQU 1248(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y9
+ VMOVDQU 1312(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1344(CX), Y9
+ VMOVDQU 1376(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1408(CX), Y9
+ VMOVDQU 1440(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1472(CX), Y9
+ VMOVDQU 1504(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y11
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1536(CX), Y9
+ VMOVDQU 1568(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1600(CX), Y9
+ VMOVDQU 1632(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1664(CX), Y9
+ VMOVDQU 1696(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1728(CX), Y9
+ VMOVDQU 1760(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1792(CX), Y9
+ VMOVDQU 1824(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1856(CX), Y9
+ VMOVDQU 1888(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1920(CX), Y9
+ VMOVDQU 1952(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1984(CX), Y9
+ VMOVDQU 2016(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y11
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2048(CX), Y9
+ VMOVDQU 2080(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2112(CX), Y9
+ VMOVDQU 2144(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2176(CX), Y9
+ VMOVDQU 2208(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2240(CX), Y9
+ VMOVDQU 2272(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2304(CX), Y9
+ VMOVDQU 2336(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2368(CX), Y9
+ VMOVDQU 2400(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2432(CX), Y9
+ VMOVDQU 2464(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 2496(CX), Y9
+ VMOVDQU 2528(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y11
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2560(CX), Y9
+ VMOVDQU 2592(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2624(CX), Y9
+ VMOVDQU 2656(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2688(CX), Y9
+ VMOVDQU 2720(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2752(CX), Y9
+ VMOVDQU 2784(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2816(CX), Y9
+ VMOVDQU 2848(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2880(CX), Y9
+ VMOVDQU 2912(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2944(CX), Y9
+ VMOVDQU 2976(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3008(CX), Y9
+ VMOVDQU 3040(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y11
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 3072(CX), Y9
+ VMOVDQU 3104(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 3136(CX), Y9
+ VMOVDQU 3168(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 3200(CX), Y9
+ VMOVDQU 3232(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 3264(CX), Y9
+ VMOVDQU 3296(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 3328(CX), Y9
+ VMOVDQU 3360(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 3392(CX), Y9
+ VMOVDQU 3424(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 3456(CX), Y9
+ VMOVDQU 3488(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3520(CX), Y9
+ VMOVDQU 3552(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 3584(CX), Y9
+ VMOVDQU 3616(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 3648(CX), Y9
+ VMOVDQU 3680(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 3712(CX), Y9
+ VMOVDQU 3744(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 3776(CX), Y9
+ VMOVDQU 3808(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 3840(CX), Y9
+ VMOVDQU 3872(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 3904(CX), Y9
+ VMOVDQU 3936(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 3968(CX), Y9
+ VMOVDQU 4000(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 4032(CX), Y9
+ VMOVDQU 4064(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y0, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y1, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y2, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y3, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y4, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxTwo_8x8Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_8x8Xor_end:
+ RET
+
+// func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x9(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 158 YMM used
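+ // Each 32-byte input block is split into low and high nibbles
+ // (VPSRLQ $4 + VPAND with a broadcast 0x0f mask), each nibble indexes a
+ // 32-byte lookup table from matrix via VPSHUFB, and the two partial
+ // products are combined into the 9 output accumulators: VPXOR for
+ // input 0, the XOR3WAY helper for every later input.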
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x9_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_8x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y0
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y1
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y2
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y3
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y4
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y5
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y6
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y7
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y8
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y12
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1152(CX), Y10
+ VMOVDQU 1184(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1216(CX), Y10
+ VMOVDQU 1248(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1280(CX), Y10
+ VMOVDQU 1312(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1344(CX), Y10
+ VMOVDQU 1376(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1408(CX), Y10
+ VMOVDQU 1440(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 1472(CX), Y10
+ VMOVDQU 1504(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 1536(CX), Y10
+ VMOVDQU 1568(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1600(CX), Y10
+ VMOVDQU 1632(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1664(CX), Y10
+ VMOVDQU 1696(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y12
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1728(CX), Y10
+ VMOVDQU 1760(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1792(CX), Y10
+ VMOVDQU 1824(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1856(CX), Y10
+ VMOVDQU 1888(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1920(CX), Y10
+ VMOVDQU 1952(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1984(CX), Y10
+ VMOVDQU 2016(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2048(CX), Y10
+ VMOVDQU 2080(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2112(CX), Y10
+ VMOVDQU 2144(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2176(CX), Y10
+ VMOVDQU 2208(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2240(CX), Y10
+ VMOVDQU 2272(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y12
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2304(CX), Y10
+ VMOVDQU 2336(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2368(CX), Y10
+ VMOVDQU 2400(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 2432(CX), Y10
+ VMOVDQU 2464(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 2496(CX), Y10
+ VMOVDQU 2528(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 2560(CX), Y10
+ VMOVDQU 2592(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2624(CX), Y10
+ VMOVDQU 2656(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2688(CX), Y10
+ VMOVDQU 2720(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2752(CX), Y10
+ VMOVDQU 2784(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2816(CX), Y10
+ VMOVDQU 2848(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y12
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2880(CX), Y10
+ VMOVDQU 2912(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2944(CX), Y10
+ VMOVDQU 2976(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3008(CX), Y10
+ VMOVDQU 3040(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3072(CX), Y10
+ VMOVDQU 3104(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3136(CX), Y10
+ VMOVDQU 3168(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3200(CX), Y10
+ VMOVDQU 3232(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3264(CX), Y10
+ VMOVDQU 3296(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3328(CX), Y10
+ VMOVDQU 3360(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3392(CX), Y10
+ VMOVDQU 3424(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y12
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 3456(CX), Y10
+ VMOVDQU 3488(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 3520(CX), Y10
+ VMOVDQU 3552(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3584(CX), Y10
+ VMOVDQU 3616(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3648(CX), Y10
+ VMOVDQU 3680(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3712(CX), Y10
+ VMOVDQU 3744(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3776(CX), Y10
+ VMOVDQU 3808(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3840(CX), Y10
+ VMOVDQU 3872(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3904(CX), Y10
+ VMOVDQU 3936(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3968(CX), Y10
+ VMOVDQU 4000(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 4032(CX), Y10
+ VMOVDQU 4064(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 4096(CX), Y10
+ VMOVDQU 4128(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 4160(CX), Y10
+ VMOVDQU 4192(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 4224(CX), Y10
+ VMOVDQU 4256(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 4288(CX), Y10
+ VMOVDQU 4320(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 4352(CX), Y10
+ VMOVDQU 4384(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 4416(CX), Y10
+ VMOVDQU 4448(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 4480(CX), Y10
+ VMOVDQU 4512(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 4544(CX), Y10
+ VMOVDQU 4576(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
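+ // out is a [][]byte kept on the stack: each 24-byte step in R12 is the
+ // next slice header, whose data pointer is loaded into R14 and indexed
+ // by the running byte offset in R13.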
+ MOVQ (R12), R14
+ VMOVDQU Y0, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y1, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y2, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y3, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y4, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxTwo_8x9_loop
+ VZEROUPPER
+
+mulAvxTwo_8x9_end:
+ RET
+
+// func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x9_64(SB), $0-88
+ // Loading 21 of 72 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 83 YMM used
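+ // Each matrix entry is an 8x8 GF(2) bit matrix; the first 21 of the 72
+ // entries are broadcast into ZMM registers (VBROADCASTF32X2), the rest
+ // are broadcast from memory per use (VGF2P8AFFINEQB.BCST). 64-byte input
+ // blocks are multiplied with VGF2P8AFFINEQB and the partial products are
+ // XORed (VXORPD) into the 9 ZMM output accumulators.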
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 9 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z21, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z22, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z23, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x9_64_loop
+ VZEROUPPER
+
+mulGFNI_8x9_64_end:
+ RET
+
+// func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x9(SB), $0-88
+ // Loading 5 of 72 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 83 YMM used
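+ // 256-bit GFNI variant: only 5 of the 72 8x8 bit matrices stay resident
+ // in YMM registers (VBROADCASTSD); the remainder are broadcast from
+ // matrix on demand before each VGF2P8AFFINEQB, and partial products are
+ // folded into the 9 output accumulators with VXORPD.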
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x9_end:
+ RET
+
+// func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x9_64Xor(SB), $0-88
+ // Loading 21 of 72 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 83 YMM used
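+ // Xor variant: the 9 existing output blocks are loaded at the top of
+ // each loop iteration and the GF2P8AFFINEQB partial products are XORed
+ // on top of them, so the result accumulates into out instead of
+ // overwriting it.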
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x9_64Xor_loop:
+ // Load 9 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 (R14)(R13*1), Z21
+ MOVQ 24(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z22
+ MOVQ 48(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z23
+ MOVQ 72(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z24
+ MOVQ 96(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z25
+ MOVQ 120(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z26
+ MOVQ 144(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z27
+ MOVQ 168(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z28
+ MOVQ 192(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z29
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 9 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z21, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z22, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z23, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x9Xor(SB), $0-88
+ // Loading 5 of 72 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 83 YMM used
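+ // Xor variant of mulAvxGFNI_8x9: the current contents of the 9 outputs
+ // are loaded first and every partial product is XORed on top, so the
+ // routine adds into out rather than replacing it.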
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x9Xor_loop:
+ // Load 9 outputs
+ MOVQ (R12), R14
+ VMOVDQU (R14)(R13*1), Y5
+ MOVQ 24(R12), R14
+ VMOVDQU (R14)(R13*1), Y6
+ MOVQ 48(R12), R14
+ VMOVDQU (R14)(R13*1), Y7
+ MOVQ 72(R12), R14
+ VMOVDQU (R14)(R13*1), Y8
+ MOVQ 96(R12), R14
+ VMOVDQU (R14)(R13*1), Y9
+ MOVQ 120(R12), R14
+ VMOVDQU (R14)(R13*1), Y10
+ MOVQ 144(R12), R14
+ VMOVDQU (R14)(R13*1), Y11
+ MOVQ 168(R12), R14
+ VMOVDQU (R14)(R13*1), Y12
+ MOVQ 192(R12), R14
+ VMOVDQU (R14)(R13*1), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x9Xor_end:
+ RET
+
+// func mulAvxTwo_8x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_8x9Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 158 YMM used
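+ // Xor variant of mulAvxTwo_8x9: for input 0 the existing output blocks
+ // are loaded from out and used as the initial accumulators, so the
+ // nibble-table products are XORed on top of the previous contents
+ // instead of starting from zero.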
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x9Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_8x9Xor_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ MOVQ (R12), R14
+ VMOVDQU (R14)(R13*1), Y0
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ MOVQ 24(R12), R14
+ VMOVDQU (R14)(R13*1), Y1
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ MOVQ 48(R12), R14
+ VMOVDQU (R14)(R13*1), Y2
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ MOVQ 72(R12), R14
+ VMOVDQU (R14)(R13*1), Y3
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ MOVQ 96(R12), R14
+ VMOVDQU (R14)(R13*1), Y4
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ MOVQ 120(R12), R14
+ VMOVDQU (R14)(R13*1), Y5
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ MOVQ 144(R12), R14
+ VMOVDQU (R14)(R13*1), Y6
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ MOVQ 168(R12), R14
+ VMOVDQU (R14)(R13*1), Y7
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ MOVQ 192(R12), R14
+ VMOVDQU (R14)(R13*1), Y8
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y12
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1152(CX), Y10
+ VMOVDQU 1184(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1216(CX), Y10
+ VMOVDQU 1248(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1280(CX), Y10
+ VMOVDQU 1312(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1344(CX), Y10
+ VMOVDQU 1376(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1408(CX), Y10
+ VMOVDQU 1440(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 1472(CX), Y10
+ VMOVDQU 1504(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 1536(CX), Y10
+ VMOVDQU 1568(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1600(CX), Y10
+ VMOVDQU 1632(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1664(CX), Y10
+ VMOVDQU 1696(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y12
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1728(CX), Y10
+ VMOVDQU 1760(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1792(CX), Y10
+ VMOVDQU 1824(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1856(CX), Y10
+ VMOVDQU 1888(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1920(CX), Y10
+ VMOVDQU 1952(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1984(CX), Y10
+ VMOVDQU 2016(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2048(CX), Y10
+ VMOVDQU 2080(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2112(CX), Y10
+ VMOVDQU 2144(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2176(CX), Y10
+ VMOVDQU 2208(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2240(CX), Y10
+ VMOVDQU 2272(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y12
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2304(CX), Y10
+ VMOVDQU 2336(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2368(CX), Y10
+ VMOVDQU 2400(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 2432(CX), Y10
+ VMOVDQU 2464(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 2496(CX), Y10
+ VMOVDQU 2528(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 2560(CX), Y10
+ VMOVDQU 2592(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2624(CX), Y10
+ VMOVDQU 2656(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2688(CX), Y10
+ VMOVDQU 2720(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2752(CX), Y10
+ VMOVDQU 2784(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2816(CX), Y10
+ VMOVDQU 2848(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y12
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2880(CX), Y10
+ VMOVDQU 2912(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2944(CX), Y10
+ VMOVDQU 2976(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3008(CX), Y10
+ VMOVDQU 3040(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3072(CX), Y10
+ VMOVDQU 3104(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3136(CX), Y10
+ VMOVDQU 3168(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3200(CX), Y10
+ VMOVDQU 3232(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3264(CX), Y10
+ VMOVDQU 3296(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3328(CX), Y10
+ VMOVDQU 3360(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3392(CX), Y10
+ VMOVDQU 3424(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y12
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 3456(CX), Y10
+ VMOVDQU 3488(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 3520(CX), Y10
+ VMOVDQU 3552(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3584(CX), Y10
+ VMOVDQU 3616(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3648(CX), Y10
+ VMOVDQU 3680(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3712(CX), Y10
+ VMOVDQU 3744(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3776(CX), Y10
+ VMOVDQU 3808(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3840(CX), Y10
+ VMOVDQU 3872(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3904(CX), Y10
+ VMOVDQU 3936(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3968(CX), Y10
+ VMOVDQU 4000(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 4032(CX), Y10
+ VMOVDQU 4064(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 4096(CX), Y10
+ VMOVDQU 4128(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 4160(CX), Y10
+ VMOVDQU 4192(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 4224(CX), Y10
+ VMOVDQU 4256(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 4288(CX), Y10
+ VMOVDQU 4320(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 4352(CX), Y10
+ VMOVDQU 4384(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 4416(CX), Y10
+ VMOVDQU 4448(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 4480(CX), Y10
+ VMOVDQU 4512(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 4544(CX), Y10
+ VMOVDQU 4576(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y0, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y1, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y2, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y3, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y4, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxTwo_8x9Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_8x9Xor_end:
+ RET
+
+// func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
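+//
+// Each iteration consumes one 32-byte block from all 8 inputs. A block is
+// split into low/high nibbles (VPSRLQ $4 plus VPAND with the 0x0f mask kept in
+// Y10); each nibble indexes a 32-byte lookup table from the matrix slice with
+// VPSHUFB, and the two partial products are folded into the per-output
+// accumulators Y0-Y9 (VPXOR seeds them on input 0, XOR3WAY accumulates for the
+// rest). With 10 outputs the destination pointers do not fit in registers, so
+// each store reloads its pointer from the out slice headers at R12.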
+TEXT ·mulAvxTwo_8x10(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 175 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x10_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_8x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y0
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y1
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y2
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y3
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y4
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y5
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y6
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y7
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y8
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y9
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y13
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y13
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1280(CX), Y11
+ VMOVDQU 1312(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1344(CX), Y11
+ VMOVDQU 1376(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 1408(CX), Y11
+ VMOVDQU 1440(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 1472(CX), Y11
+ VMOVDQU 1504(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 1536(CX), Y11
+ VMOVDQU 1568(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 1600(CX), Y11
+ VMOVDQU 1632(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1664(CX), Y11
+ VMOVDQU 1696(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1728(CX), Y11
+ VMOVDQU 1760(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1792(CX), Y11
+ VMOVDQU 1824(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1856(CX), Y11
+ VMOVDQU 1888(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y13
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1920(CX), Y11
+ VMOVDQU 1952(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1984(CX), Y11
+ VMOVDQU 2016(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2048(CX), Y11
+ VMOVDQU 2080(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2112(CX), Y11
+ VMOVDQU 2144(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2176(CX), Y11
+ VMOVDQU 2208(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2240(CX), Y11
+ VMOVDQU 2272(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2304(CX), Y11
+ VMOVDQU 2336(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 2368(CX), Y11
+ VMOVDQU 2400(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 2432(CX), Y11
+ VMOVDQU 2464(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 2496(CX), Y11
+ VMOVDQU 2528(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y13
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 2560(CX), Y11
+ VMOVDQU 2592(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 2624(CX), Y11
+ VMOVDQU 2656(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2688(CX), Y11
+ VMOVDQU 2720(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2752(CX), Y11
+ VMOVDQU 2784(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2816(CX), Y11
+ VMOVDQU 2848(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2880(CX), Y11
+ VMOVDQU 2912(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2944(CX), Y11
+ VMOVDQU 2976(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3008(CX), Y11
+ VMOVDQU 3040(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3072(CX), Y11
+ VMOVDQU 3104(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3136(CX), Y11
+ VMOVDQU 3168(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y13
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3200(CX), Y11
+ VMOVDQU 3232(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3264(CX), Y11
+ VMOVDQU 3296(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3328(CX), Y11
+ VMOVDQU 3360(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 3392(CX), Y11
+ VMOVDQU 3424(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 3456(CX), Y11
+ VMOVDQU 3488(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 3520(CX), Y11
+ VMOVDQU 3552(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 3584(CX), Y11
+ VMOVDQU 3616(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3648(CX), Y11
+ VMOVDQU 3680(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3712(CX), Y11
+ VMOVDQU 3744(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3776(CX), Y11
+ VMOVDQU 3808(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y13
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3840(CX), Y11
+ VMOVDQU 3872(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3904(CX), Y11
+ VMOVDQU 3936(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3968(CX), Y11
+ VMOVDQU 4000(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 4032(CX), Y11
+ VMOVDQU 4064(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 4096(CX), Y11
+ VMOVDQU 4128(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 4160(CX), Y11
+ VMOVDQU 4192(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 4224(CX), Y11
+ VMOVDQU 4256(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 4288(CX), Y11
+ VMOVDQU 4320(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 4352(CX), Y11
+ VMOVDQU 4384(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 4416(CX), Y11
+ VMOVDQU 4448(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 4480(CX), Y11
+ VMOVDQU 4512(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 4544(CX), Y11
+ VMOVDQU 4576(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 4608(CX), Y11
+ VMOVDQU 4640(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 4672(CX), Y11
+ VMOVDQU 4704(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 4736(CX), Y11
+ VMOVDQU 4768(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 4800(CX), Y11
+ VMOVDQU 4832(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 4864(CX), Y11
+ VMOVDQU 4896(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 4928(CX), Y11
+ VMOVDQU 4960(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 4992(CX), Y11
+ VMOVDQU 5024(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 5056(CX), Y11
+ VMOVDQU 5088(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y0, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y1, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y2, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y3, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y4, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 216(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxTwo_8x10_loop
+ VZEROUPPER
+
+mulAvxTwo_8x10_end:
+ RET
+
+// func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
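+//
+// GFNI/AVX-512 variant operating on 64-byte blocks. Every 8-byte entry of the
+// matrix slice is an 8x8 bit-matrix which, fed to VGF2P8AFFINEQB, multiplies a
+// whole vector of bytes by one fixed GF(2^8) coefficient. The first 20 of the
+// 8x10 = 80 matrices are broadcast into Z0-Z19 up front; the remainder are
+// broadcast straight from memory with the .BCST form. Input 0 writes the
+// accumulators Z20-Z29 directly; later inputs are XORed in via Z31/VXORPD.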
+TEXT ·mulGFNI_8x10_64(SB), $0-88
+ // Loading 20 of 80 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 92 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 10 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z20, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z21, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z22, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z23, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 216(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x10_64_loop
+ VZEROUPPER
+
+mulGFNI_8x10_64_end:
+ RET
+
+// func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
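+//
+// 256-bit GFNI variant of the kernel above: 32-byte blocks and the same 80
+// coefficient matrices, but only the first 4 stay resident in Y0-Y3. All other
+// coefficients are VBROADCASTSD-loaded immediately before the corresponding
+// VGF2P8AFFINEQB, with Y4-Y13 serving as the 10 output accumulators.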
+TEXT ·mulAvxGFNI_8x10(SB), $0-88
+ // Loading 4 of 80 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 92 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y4, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 216(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x10_end:
+ RET
+
+// func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
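+//
+// Xor form of mulGFNI_8x10_64: each loop iteration first loads the current
+// contents of the 10 destinations into Z20-Z29, then XORs every affine product
+// on top, so the routine accumulates into existing output data instead of
+// overwriting it.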
+TEXT ·mulGFNI_8x10_64Xor(SB), $0-88
+ // Loading 20 of 80 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 92 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x10_64Xor_loop:
+ // Load 10 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 (R14)(R13*1), Z20
+ MOVQ 24(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z21
+ MOVQ 48(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z22
+ MOVQ 72(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z23
+ MOVQ 96(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z24
+ MOVQ 120(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z25
+ MOVQ 144(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z26
+ MOVQ 168(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z27
+ MOVQ 192(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z28
+ MOVQ 216(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 10 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z20, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z21, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z22, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z23, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 216(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
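+//
+// Xor form of mulAvxGFNI_8x10: the current 32-byte output blocks are read into
+// Y4-Y13 before any input is processed and every product is folded in with
+// VXORPD, so existing destination data is accumulated rather than replaced.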
+TEXT ·mulAvxGFNI_8x10Xor(SB), $0-88
+ // Loading 4 of 80 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 92 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x10Xor_loop:
+ // Load 10 outputs
+ MOVQ (R12), R14
+ VMOVDQU (R14)(R13*1), Y4
+ MOVQ 24(R12), R14
+ VMOVDQU (R14)(R13*1), Y5
+ MOVQ 48(R12), R14
+ VMOVDQU (R14)(R13*1), Y6
+ MOVQ 72(R12), R14
+ VMOVDQU (R14)(R13*1), Y7
+ MOVQ 96(R12), R14
+ VMOVDQU (R14)(R13*1), Y8
+ MOVQ 120(R12), R14
+ VMOVDQU (R14)(R13*1), Y9
+ MOVQ 144(R12), R14
+ VMOVDQU (R14)(R13*1), Y10
+ MOVQ 168(R12), R14
+ VMOVDQU (R14)(R13*1), Y11
+ MOVQ 192(R12), R14
+ VMOVDQU (R14)(R13*1), Y12
+ MOVQ 216(R12), R14
+ VMOVDQU (R14)(R13*1), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y4, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 216(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x10Xor_end:
+ RET
+
+// func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
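+//
+// Xor form of mulAvxTwo_8x10: while processing input 0 each accumulator Y0-Y9
+// is seeded from the corresponding destination block (fetched through the
+// out_base pointers) rather than from a fresh VPXOR, so the nibble-table
+// products are XORed on top of whatever the outputs already contain.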
+TEXT ·mulAvxTwo_8x10Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 175 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_8x10Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_8x10Xor_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ MOVQ (R12), R14
+ VMOVDQU (R14)(R13*1), Y0
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ MOVQ 24(R12), R14
+ VMOVDQU (R14)(R13*1), Y1
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ MOVQ 48(R12), R14
+ VMOVDQU (R14)(R13*1), Y2
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ MOVQ 72(R12), R14
+ VMOVDQU (R14)(R13*1), Y3
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ MOVQ 96(R12), R14
+ VMOVDQU (R14)(R13*1), Y4
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ MOVQ 120(R12), R14
+ VMOVDQU (R14)(R13*1), Y5
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ MOVQ 144(R12), R14
+ VMOVDQU (R14)(R13*1), Y6
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ MOVQ 168(R12), R14
+ VMOVDQU (R14)(R13*1), Y7
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ MOVQ 192(R12), R14
+ VMOVDQU (R14)(R13*1), Y8
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ MOVQ 216(R12), R14
+ VMOVDQU (R14)(R13*1), Y9
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y13
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y13
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1280(CX), Y11
+ VMOVDQU 1312(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1344(CX), Y11
+ VMOVDQU 1376(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 1408(CX), Y11
+ VMOVDQU 1440(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 1472(CX), Y11
+ VMOVDQU 1504(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 1536(CX), Y11
+ VMOVDQU 1568(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 1600(CX), Y11
+ VMOVDQU 1632(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1664(CX), Y11
+ VMOVDQU 1696(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1728(CX), Y11
+ VMOVDQU 1760(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1792(CX), Y11
+ VMOVDQU 1824(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1856(CX), Y11
+ VMOVDQU 1888(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y13
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1920(CX), Y11
+ VMOVDQU 1952(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1984(CX), Y11
+ VMOVDQU 2016(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2048(CX), Y11
+ VMOVDQU 2080(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2112(CX), Y11
+ VMOVDQU 2144(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2176(CX), Y11
+ VMOVDQU 2208(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2240(CX), Y11
+ VMOVDQU 2272(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2304(CX), Y11
+ VMOVDQU 2336(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 2368(CX), Y11
+ VMOVDQU 2400(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 2432(CX), Y11
+ VMOVDQU 2464(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 2496(CX), Y11
+ VMOVDQU 2528(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y13
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 2560(CX), Y11
+ VMOVDQU 2592(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 2624(CX), Y11
+ VMOVDQU 2656(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2688(CX), Y11
+ VMOVDQU 2720(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2752(CX), Y11
+ VMOVDQU 2784(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2816(CX), Y11
+ VMOVDQU 2848(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2880(CX), Y11
+ VMOVDQU 2912(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2944(CX), Y11
+ VMOVDQU 2976(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3008(CX), Y11
+ VMOVDQU 3040(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3072(CX), Y11
+ VMOVDQU 3104(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3136(CX), Y11
+ VMOVDQU 3168(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y13
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3200(CX), Y11
+ VMOVDQU 3232(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3264(CX), Y11
+ VMOVDQU 3296(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3328(CX), Y11
+ VMOVDQU 3360(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 3392(CX), Y11
+ VMOVDQU 3424(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 3456(CX), Y11
+ VMOVDQU 3488(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 3520(CX), Y11
+ VMOVDQU 3552(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 3584(CX), Y11
+ VMOVDQU 3616(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3648(CX), Y11
+ VMOVDQU 3680(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3712(CX), Y11
+ VMOVDQU 3744(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3776(CX), Y11
+ VMOVDQU 3808(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y13
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3840(CX), Y11
+ VMOVDQU 3872(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3904(CX), Y11
+ VMOVDQU 3936(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3968(CX), Y11
+ VMOVDQU 4000(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 4032(CX), Y11
+ VMOVDQU 4064(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 4096(CX), Y11
+ VMOVDQU 4128(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 4160(CX), Y11
+ VMOVDQU 4192(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 4224(CX), Y11
+ VMOVDQU 4256(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 4288(CX), Y11
+ VMOVDQU 4320(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 4352(CX), Y11
+ VMOVDQU 4384(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 4416(CX), Y11
+ VMOVDQU 4448(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 4480(CX), Y11
+ VMOVDQU 4512(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 4544(CX), Y11
+ VMOVDQU 4576(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 4608(CX), Y11
+ VMOVDQU 4640(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 4672(CX), Y11
+ VMOVDQU 4704(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 4736(CX), Y11
+ VMOVDQU 4768(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 4800(CX), Y11
+ VMOVDQU 4832(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 4864(CX), Y11
+ VMOVDQU 4896(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 4928(CX), Y11
+ VMOVDQU 4960(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 4992(CX), Y11
+ VMOVDQU 5024(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 5056(CX), Y11
+ VMOVDQU 5088(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y0, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y1, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y2, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y3, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y4, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 216(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxTwo_8x10Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_8x10Xor_end:
+ RET
+
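+// The mulAvxTwo_* kernels perform GF(2^8) multiplication with the split-nibble
+// table method: each 32-byte block is separated into low and high nibbles
+// (VPAND / VPSRLQ+VPAND), both nibbles index a pair of 32-byte VPSHUFB lookup
+// tables per (input, output) coefficient, and the two partial products are
+// folded into the output accumulator via the XOR3WAY macro. The _64 variants
+// process two 32-byte blocks (64 bytes) per loop iteration.
+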
+// func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_9x1_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x1_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X2
+ VPBROADCASTB X2, Y2
+
+mulAvxTwo_9x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU 32(BX), Y5
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ VPXOR Y3, Y4, Y0
+ VPXOR Y5, Y6, Y1
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU 32(SI), Y5
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU 32(DI), Y5
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU (R8), Y6
+ VMOVDQU 32(R8), Y5
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU (R9), Y6
+ VMOVDQU 32(R9), Y5
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 256(CX), Y3
+ VMOVDQU 288(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU (R10), Y6
+ VMOVDQU 32(R10), Y5
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 320(CX), Y3
+ VMOVDQU 352(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU (R11), Y6
+ VMOVDQU 32(R11), Y5
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 384(CX), Y3
+ VMOVDQU 416(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU (R12), Y6
+ VMOVDQU 32(R12), Y5
+ ADDQ $0x40, R12
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 448(CX), Y3
+ VMOVDQU 480(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 8 to 1 outputs
+ VMOVDQU (DX), Y6
+ VMOVDQU 32(DX), Y5
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 512(CX), Y3
+ VMOVDQU 544(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Store 1 outputs
+ VMOVDQU Y0, (R13)
+ VMOVDQU Y1, 32(R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_9x1_64_loop
+ VZEROUPPER
+
+mulAvxTwo_9x1_64_end:
+ RET
+
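+// The mulGFNI_*_64 kernels take each GF(2^8) coefficient as an 8x8 bit matrix
+// packed into a uint64: VBROADCASTF32X2 replicates the matrix across a ZMM
+// register, VGF2P8AFFINEQB applies it to 64 input bytes at once, and VXORPD
+// accumulates the partial products into the output register.
+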
+// func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), CX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, CX
+
+mulGFNI_9x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z10
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z10, Z9
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z10
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z10
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z10
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z10
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (R9), Z10
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z5, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU64 (R10), Z10
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z6, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU64 (R11), Z10
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z7, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 8 to 1 outputs
+ VMOVDQU64 (CX), Z10
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z8, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Store 1 outputs
+ VMOVDQU64 Z9, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_9x1_64_loop
+ VZEROUPPER
+
+mulGFNI_9x1_64_end:
+ RET
+
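+// mulAvxGFNI_* is the 256-bit GFNI variant, requiring only AVX and GFNI rather
+// than AVX-512: the matrices are broadcast with VBROADCASTSD into YMM
+// registers and each iteration handles 32 bytes per input shard.
+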
+// func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), CX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, CX
+
+mulAvxGFNI_9x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y10, Y9
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y10
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (R9), Y10
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y5, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 6 to 1 outputs
+ VMOVDQU (R10), Y10
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y6, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 7 to 1 outputs
+ VMOVDQU (R11), Y10
+ ADDQ $0x20, R11
+ VGF2P8AFFINEQB $0x00, Y7, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 8 to 1 outputs
+ VMOVDQU (CX), Y10
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y8, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Store 1 outputs
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_9x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x1_end:
+ RET
+
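+// The *Xor variants first load the current contents of every output vector and
+// XOR the new products into them instead of overwriting, so callers can
+// accumulate onto outputs that already hold partial results.
+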
+// func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), CX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, CX
+
+mulGFNI_9x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (R12), Z9
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z10
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z10
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z10
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z10
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z10
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (R9), Z10
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z5, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU64 (R10), Z10
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z6, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU64 (R11), Z10
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z7, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 8 to 1 outputs
+ VMOVDQU64 (CX), Z10
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z8, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Store 1 outputs
+ VMOVDQU64 Z9, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_9x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), CX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, CX
+
+mulAvxGFNI_9x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R12), Y9
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y10
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (R9), Y10
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y5, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 6 to 1 outputs
+ VMOVDQU (R10), Y10
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y6, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 7 to 1 outputs
+ VMOVDQU (R11), Y10
+ ADDQ $0x20, R11
+ VGF2P8AFFINEQB $0x00, Y7, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 8 to 1 outputs
+ VMOVDQU (CX), Y10
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y8, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Store 1 outputs
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_9x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x1Xor_end:
+ RET
+
+// func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_9x1_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x1_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R14
+ MOVQ R14, X2
+ VPBROADCASTB X2, Y2
+
+mulAvxTwo_9x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R13), Y0
+ VMOVDQU 32(R13), Y1
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU 32(BX), Y5
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU 32(SI), Y5
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU 32(DI), Y5
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU (R8), Y6
+ VMOVDQU 32(R8), Y5
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU (R9), Y6
+ VMOVDQU 32(R9), Y5
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 256(CX), Y3
+ VMOVDQU 288(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU (R10), Y6
+ VMOVDQU 32(R10), Y5
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 320(CX), Y3
+ VMOVDQU 352(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU (R11), Y6
+ VMOVDQU 32(R11), Y5
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 384(CX), Y3
+ VMOVDQU 416(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU (R12), Y6
+ VMOVDQU 32(R12), Y5
+ ADDQ $0x40, R12
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 448(CX), Y3
+ VMOVDQU 480(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 8 to 1 outputs
+ VMOVDQU (DX), Y6
+ VMOVDQU 32(DX), Y5
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 512(CX), Y3
+ VMOVDQU 544(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Store 1 outputs
+ VMOVDQU Y0, (R13)
+ VMOVDQU Y1, 32(R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_9x1_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_9x1_64Xor_end:
+ RET
+
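+// The wider 9-input kernels below produce several outputs per pass. Every
+// extra output costs an additional pair of 32-byte lookup tables per input and
+// another set of accumulator registers (Y0/Y1 plus Y2/Y3 in the 9x2 case); the
+// lookup tables themselves stay in memory and are streamed from CX on each
+// iteration.
+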
+// func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_9x2_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 81 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x2_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R13
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R14
+ ADDQ R15, R13
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_9x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VPXOR Y7, Y8, Y3
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y9
+ VMOVDQU 32(SI), Y11
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU 32(DI), Y11
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y9
+ VMOVDQU 32(R8), Y11
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y9
+ VMOVDQU 32(R9), Y11
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y9
+ VMOVDQU 32(R10), Y11
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y9
+ VMOVDQU 32(R11), Y11
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU (R12), Y9
+ VMOVDQU 32(R12), Y11
+ ADDQ $0x40, R12
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 8 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 1024(CX), Y5
+ VMOVDQU 1056(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1088(CX), Y5
+ VMOVDQU 1120(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (R14)
+ VMOVDQU Y1, 32(R14)
+ ADDQ $0x40, R14
+ VMOVDQU Y2, (R13)
+ VMOVDQU Y3, 32(R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_9x2_64_loop
+ VZEROUPPER
+
+mulAvxTwo_9x2_64_end:
+ RET
+
+// func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), CX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R12
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+ ADDQ R14, R12
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, CX
+
+mulGFNI_9x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z20
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z20, Z18
+ VGF2P8AFFINEQB $0x00, Z1, Z20, Z19
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z20
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z20
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z20
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z20
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (R9), Z20
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU64 (R10), Z20
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU64 (R11), Z20
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z15, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 8 to 2 outputs
+ VMOVDQU64 (CX), Z20
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z16, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z17, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Store 2 outputs
+ VMOVDQU64 Z18, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z19, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_9x2_64_loop
+ VZEROUPPER
+
+mulGFNI_9x2_64_end:
+ RET
+
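+// With 18 coefficient matrices and only 16 YMM registers, the AVX GFNI 9x2
+// kernel keeps the first 12 matrices resident (Y0-Y11) and re-broadcasts the
+// remaining six from memory (VBROADCASTSD 96(CX) through 136(CX)) inside the
+// loop, as noted by "Loading 12 of 18 tables to registers" below.
+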
+// func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x2(SB), $0-88
+ // Loading 12 of 18 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R13
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R14
+ ADDQ R15, R13
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, DX
+
+mulAvxGFNI_9x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 2 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R13)
+ ADDQ $0x20, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_9x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x2_end:
+ RET
+
+// func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), CX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R12
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+ ADDQ R14, R12
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, CX
+
+mulGFNI_9x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (R13), Z18
+ VMOVDQU64 (R12), Z19
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z20
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z1, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z20
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z20
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z20
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z20
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (R9), Z20
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU64 (R10), Z20
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU64 (R11), Z20
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z15, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 8 to 2 outputs
+ VMOVDQU64 (CX), Z20
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z16, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z17, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Store 2 outputs
+ VMOVDQU64 Z18, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z19, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_9x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x2Xor(SB), $0-88
+ // Loading 12 of 18 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R13
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R14
+ ADDQ R15, R13
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, DX
+
+mulAvxGFNI_9x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R14), Y12
+ VMOVDQU (R13), Y13
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 2 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R13)
+ ADDQ $0x20, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_9x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x2Xor_end:
+ RET
+
+// func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_9x2_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 81 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x2_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R13
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R14
+ ADDQ R15, R13
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_9x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R14), Y0
+ VMOVDQU 32(R14), Y1
+ VMOVDQU (R13), Y2
+ VMOVDQU 32(R13), Y3
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y9
+ VMOVDQU 32(SI), Y11
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU 32(DI), Y11
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y9
+ VMOVDQU 32(R8), Y11
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y9
+ VMOVDQU 32(R9), Y11
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y9
+ VMOVDQU 32(R10), Y11
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y9
+ VMOVDQU 32(R11), Y11
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU (R12), Y9
+ VMOVDQU 32(R12), Y11
+ ADDQ $0x40, R12
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 8 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 1024(CX), Y5
+ VMOVDQU 1056(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1088(CX), Y5
+ VMOVDQU 1120(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (R14)
+ VMOVDQU Y1, 32(R14)
+ ADDQ $0x40, R14
+ VMOVDQU Y2, (R13)
+ VMOVDQU Y3, 32(R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_9x2_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_9x2_64Xor_end:
+ RET
+
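+// Note: the mulAvxTwo_* kernels use the split-nibble lookup form of GF(2^8)
+// multiplication: each input byte is split into its low and high 4-bit halves
+// (VPAND with a broadcast 0x0f mask and VPSRLQ by 4), each half selects an
+// entry from a 32-byte VPSHUFB table taken from the matrix, and the two
+// lookups are XORed into the running output (VPXOR for the first input,
+// XOR3WAY for the remaining inputs).
+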
+// func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_9x3_64(SB), $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 118 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x3_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R15
+ MOVQ 48(R13), R13
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R13
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_9x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VPXOR Y9, Y10, Y5
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y11
+ VMOVDQU 32(SI), Y13
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y11
+ VMOVDQU 32(DI), Y13
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y11
+ VMOVDQU 32(R8), Y13
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y11
+ VMOVDQU 32(R9), Y13
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y11
+ VMOVDQU 32(R10), Y13
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU (R11), Y11
+ VMOVDQU 32(R11), Y13
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU (R12), Y11
+ VMOVDQU 32(R12), Y13
+ ADDQ $0x40, R12
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 8 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1536(CX), Y7
+ VMOVDQU 1568(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1600(CX), Y7
+ VMOVDQU 1632(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1664(CX), Y7
+ VMOVDQU 1696(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (R14)
+ VMOVDQU Y1, 32(R14)
+ ADDQ $0x40, R14
+ VMOVDQU Y2, (R15)
+ VMOVDQU Y3, 32(R15)
+ ADDQ $0x40, R15
+ VMOVDQU Y4, (R13)
+ VMOVDQU Y5, 32(R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_9x3_64_loop
+ VZEROUPPER
+
+mulAvxTwo_9x3_64_end:
+ RET
+
+// func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ VBROADCASTF32X2 208(CX), Z26
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), CX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R12
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R12
+
+ // Add start offset to input
+ ADDQ R15, DX
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, CX
+
+mulGFNI_9x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 3 outputs
+ VMOVDQU64 (CX), Z30
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z26, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 3 outputs
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_9x3_64_loop
+ VZEROUPPER
+
+mulGFNI_9x3_64_end:
+ RET
+
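+// Note: the mulGFNI_* and mulAvxGFNI_* kernels replace the table lookups with
+// VGF2P8AFFINEQB: each matrix coefficient is broadcast as a 64-bit value that
+// encodes multiplication by that coefficient as an 8x8 bit matrix, so a single
+// instruction multiplies 64 bytes (ZMM) or 32 bytes (YMM) at once, and the
+// partial products are combined with VXORPD. When the coefficients do not all
+// fit in vector registers, the remainder is broadcast from memory inside the
+// loop (VBROADCASTSD, or the .BCST embedded-broadcast form for ZMM).
+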
+// func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x3(SB), $8-88
+ // Loading 11 of 27 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R15
+ MOVQ 48(R13), R13
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R13
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, DX
+
+mulAvxGFNI_9x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 3 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 3 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R13)
+ ADDQ $0x20, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_9x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x3_end:
+ RET
+
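+// Note: the *Xor variants load the current contents of each output block before
+// the per-input passes and accumulate into them, whereas the plain variants
+// initialize the outputs from the first input and overwrite what was stored.
+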
+// func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x3_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ VBROADCASTF32X2 208(CX), Z26
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), CX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R12
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R12
+
+ // Add start offset to input
+ ADDQ R15, DX
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, CX
+
+mulGFNI_9x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (R13), Z27
+ VMOVDQU64 (R14), Z28
+ VMOVDQU64 (R12), Z29
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 3 outputs
+ VMOVDQU64 (CX), Z30
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z26, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 3 outputs
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_9x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x3Xor(SB), $8-88
+ // Loading 11 of 27 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R15
+ MOVQ 48(R13), R13
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R13
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, DX
+
+mulAvxGFNI_9x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R13), Y13
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 3 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 3 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R13)
+ ADDQ $0x20, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_9x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x3Xor_end:
+ RET
+
+// func mulAvxTwo_9x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_9x3_64Xor(SB), $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 118 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x3_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R15
+ MOVQ 48(R13), R13
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R13
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_9x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R14), Y0
+ VMOVDQU 32(R14), Y1
+ VMOVDQU (R15), Y2
+ VMOVDQU 32(R15), Y3
+ VMOVDQU (R13), Y4
+ VMOVDQU 32(R13), Y5
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y11
+ VMOVDQU 32(SI), Y13
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y11
+ VMOVDQU 32(DI), Y13
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y11
+ VMOVDQU 32(R8), Y13
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y11
+ VMOVDQU 32(R9), Y13
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y11
+ VMOVDQU 32(R10), Y13
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU (R11), Y11
+ VMOVDQU 32(R11), Y13
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU (R12), Y11
+ VMOVDQU 32(R12), Y13
+ ADDQ $0x40, R12
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 8 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1536(CX), Y7
+ VMOVDQU 1568(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1600(CX), Y7
+ VMOVDQU 1632(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1664(CX), Y7
+ VMOVDQU 1696(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (R14)
+ VMOVDQU Y1, 32(R14)
+ ADDQ $0x40, R14
+ VMOVDQU Y2, (R15)
+ VMOVDQU Y3, 32(R15)
+ ADDQ $0x40, R15
+ VMOVDQU Y4, (R13)
+ VMOVDQU Y5, 32(R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_9x3_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_9x3_64Xor_end:
+ RET
+
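+// Note: with nine inputs and four outputs the 9x4 kernels run out of spare
+// general-purpose registers, so AX doubles as the last input pointer and the
+// loop count is reloaded from n+80(FP) into BP once the start offsets have
+// been applied.
+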
+// func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_9x4(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 81 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x4_end
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), AX
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, AX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X4
+ VPBROADCASTB X4, Y4
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxTwo_9x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y1
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y3
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R8), Y7
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1024(CX), Y5
+ VMOVDQU 1056(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1088(CX), Y5
+ VMOVDQU 1120(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1152(CX), Y5
+ VMOVDQU 1184(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1216(CX), Y5
+ VMOVDQU 1248(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R9), Y7
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1280(CX), Y5
+ VMOVDQU 1312(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1344(CX), Y5
+ VMOVDQU 1376(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1408(CX), Y5
+ VMOVDQU 1440(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1472(CX), Y5
+ VMOVDQU 1504(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R10), Y7
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1536(CX), Y5
+ VMOVDQU 1568(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1600(CX), Y5
+ VMOVDQU 1632(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1664(CX), Y5
+ VMOVDQU 1696(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1728(CX), Y5
+ VMOVDQU 1760(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (R11), Y7
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1792(CX), Y5
+ VMOVDQU 1824(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1856(CX), Y5
+ VMOVDQU 1888(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1920(CX), Y5
+ VMOVDQU 1952(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1984(CX), Y5
+ VMOVDQU 2016(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 8 to 4 outputs
+ VMOVDQU (AX), Y7
+ ADDQ $0x20, AX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 2048(CX), Y5
+ VMOVDQU 2080(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 2112(CX), Y5
+ VMOVDQU 2144(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 2176(CX), Y5
+ VMOVDQU 2208(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 2240(CX), Y5
+ VMOVDQU 2272(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ VMOVDQU Y0, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y1, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y2, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y3, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxTwo_9x4_loop
+ VZEROUPPER
+
+mulAvxTwo_9x4_end:
+ RET
+
+// func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x4_64(SB), $8-88
+ // Loading 26 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), AX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_9x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 4 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 4 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 4 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 4 outputs
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_9x4_64_loop
+ VZEROUPPER
+
+mulGFNI_9x4_64_end:
+ RET
+
+// func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x4(SB), $8-88
+ // Loading 10 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), AX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_9x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 4 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_9x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x4_end:
+ RET
+
+// func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x4_64Xor(SB), $8-88
+ // Loading 26 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), AX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_9x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R12), Z29
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 4 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 4 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 4 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 4 outputs
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_9x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x4_64Xor_end:
+ RET
+
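+// The mulAvxGFNI_* kernels are the AVX-only GFNI fallback: the same affine
+// transform multiply as the 64-byte ZMM kernels above, but on 256-bit Y
+// registers, processing 32 bytes of each shard per iteration (n is shifted
+// right by 5 instead of 6).
+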
+// func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x4Xor(SB), $8-88
+ // Loading 10 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), AX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_9x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R12), Y13
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 4 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_9x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x4Xor_end:
+ RET
+
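+// The mulAvxTwo_* kernels use the split-nibble table method instead of GFNI:
+// each source byte is split into its low and high 4-bit halves (VPSRLQ plus a
+// VPAND with the broadcast 0x0f mask), both halves index a pair of 16-entry
+// lookup tables with VPSHUFB, and the XOR3WAY macro folds the two partial
+// products into the output accumulator.
+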
+// func mulAvxTwo_9x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_9x4Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 81 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x4Xor_end
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), AX
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, AX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X4
+ VPBROADCASTB X4, Y4
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxTwo_9x4Xor_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (R13), Y0
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU (R14), Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU (R15), Y2
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU (R12), Y3
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R8), Y7
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1024(CX), Y5
+ VMOVDQU 1056(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1088(CX), Y5
+ VMOVDQU 1120(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1152(CX), Y5
+ VMOVDQU 1184(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1216(CX), Y5
+ VMOVDQU 1248(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R9), Y7
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1280(CX), Y5
+ VMOVDQU 1312(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1344(CX), Y5
+ VMOVDQU 1376(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1408(CX), Y5
+ VMOVDQU 1440(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1472(CX), Y5
+ VMOVDQU 1504(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R10), Y7
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1536(CX), Y5
+ VMOVDQU 1568(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1600(CX), Y5
+ VMOVDQU 1632(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1664(CX), Y5
+ VMOVDQU 1696(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1728(CX), Y5
+ VMOVDQU 1760(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (R11), Y7
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1792(CX), Y5
+ VMOVDQU 1824(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1856(CX), Y5
+ VMOVDQU 1888(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1920(CX), Y5
+ VMOVDQU 1952(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1984(CX), Y5
+ VMOVDQU 2016(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 8 to 4 outputs
+ VMOVDQU (AX), Y7
+ ADDQ $0x20, AX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 2048(CX), Y5
+ VMOVDQU 2080(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 2112(CX), Y5
+ VMOVDQU 2144(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 2176(CX), Y5
+ VMOVDQU 2208(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 2240(CX), Y5
+ VMOVDQU 2272(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ VMOVDQU Y0, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y1, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y2, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y3, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxTwo_9x4Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_9x4Xor_end:
+ RET
+
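+// With five or more outputs the 9-input kernels run out of spare
+// general-purpose registers for destination pointers (nine input pointers, the
+// matrix pointer, the offset and a scratch register already occupy the file),
+// so from here on out_base stays in R13 ("Destination kept on stack") and every
+// store re-reads the per-output pointer, indexing it with the running offset in
+// R14.
+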
+// func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_9x5(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 100 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x5_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_9x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y0
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y1
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y2
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y3
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y4
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 640(CX), Y6
+ VMOVDQU 672(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 704(CX), Y6
+ VMOVDQU 736(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 768(CX), Y6
+ VMOVDQU 800(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 832(CX), Y6
+ VMOVDQU 864(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 896(CX), Y6
+ VMOVDQU 928(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y8
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 960(CX), Y6
+ VMOVDQU 992(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1024(CX), Y6
+ VMOVDQU 1056(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1088(CX), Y6
+ VMOVDQU 1120(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1152(CX), Y6
+ VMOVDQU 1184(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1216(CX), Y6
+ VMOVDQU 1248(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y8
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1280(CX), Y6
+ VMOVDQU 1312(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1344(CX), Y6
+ VMOVDQU 1376(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1408(CX), Y6
+ VMOVDQU 1440(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1472(CX), Y6
+ VMOVDQU 1504(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1536(CX), Y6
+ VMOVDQU 1568(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y8
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1600(CX), Y6
+ VMOVDQU 1632(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1664(CX), Y6
+ VMOVDQU 1696(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1728(CX), Y6
+ VMOVDQU 1760(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1792(CX), Y6
+ VMOVDQU 1824(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1856(CX), Y6
+ VMOVDQU 1888(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R11), Y8
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1920(CX), Y6
+ VMOVDQU 1952(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1984(CX), Y6
+ VMOVDQU 2016(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2048(CX), Y6
+ VMOVDQU 2080(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2112(CX), Y6
+ VMOVDQU 2144(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2176(CX), Y6
+ VMOVDQU 2208(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (R12), Y8
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 2240(CX), Y6
+ VMOVDQU 2272(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 2304(CX), Y6
+ VMOVDQU 2336(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2368(CX), Y6
+ VMOVDQU 2400(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2432(CX), Y6
+ VMOVDQU 2464(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2496(CX), Y6
+ VMOVDQU 2528(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 8 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 2560(CX), Y6
+ VMOVDQU 2592(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 2624(CX), Y6
+ VMOVDQU 2656(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2688(CX), Y6
+ VMOVDQU 2720(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2752(CX), Y6
+ VMOVDQU 2784(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2816(CX), Y6
+ VMOVDQU 2848(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y0, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y1, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y2, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y3, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y4, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxTwo_9x5_loop
+ VZEROUPPER
+
+mulAvxTwo_9x5_end:
+ RET
+
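+// In the GFNI kernels the matrix slice holds one 8x8 bit matrix (8 bytes) per
+// input/output pair, 9*5 = 45 entries for the 9x5 shape; each VGF2P8AFFINEQB
+// applies one of these matrices to every byte of an input vector, i.e. a
+// multiply by a constant in GF(2^8). "Loading 25 of 45 tables to registers"
+// means the first 25 matrices are broadcast into Z0-Z24 up front, while the
+// rest are broadcast from memory inside the loop (VGF2P8AFFINEQB.BCST in the
+// ZMM kernels, VBROADCASTSD in the YMM kernels).
+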
+// func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x5_64(SB), $0-88
+ // Loading 25 of 45 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 52 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 5 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 5 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x5_64_loop
+ VZEROUPPER
+
+mulGFNI_9x5_64_end:
+ RET
+
+// func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x5(SB), $0-88
+ // Loading 9 of 45 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 52 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x5_end:
+ RET
+
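+// start and n are byte offsets into every shard: start is added to each input
+// pointer before the loop (and either added to the output pointers as well or
+// carried as the index in R14, depending on whether the destinations fit in
+// registers), while n is shifted right by 6 for the 64-byte ZMM kernels and by
+// 5 for the 32-byte AVX kernels to form the iteration count, so any tail
+// smaller than the block size is not touched by these routines.
+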
+// func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x5_64Xor(SB), $0-88
+ // Loading 25 of 45 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 52 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x5_64Xor_loop:
+ // Load 5 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 (R15)(R14*1), Z25
+ MOVQ 24(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z26
+ MOVQ 48(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z27
+ MOVQ 72(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z28
+ MOVQ 96(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z29
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 5 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 5 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x5Xor(SB), $0-88
+ // Loading 9 of 45 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 52 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x5Xor_loop:
+ // Load 5 outputs
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y9
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y10
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y11
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y12
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x5Xor_end:
+ RET
+
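+// Unlike the GFNI Xor kernels, which load every output vector before touching
+// the inputs, the AVX2 Xor kernels seed each accumulator from memory while
+// already processing input 0, interleaving the output loads with the first
+// round of table lookups.
+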
+// func mulAvxTwo_9x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_9x5Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 100 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x5Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_9x5Xor_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y0
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y1
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y2
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y3
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y4
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 640(CX), Y6
+ VMOVDQU 672(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 704(CX), Y6
+ VMOVDQU 736(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 768(CX), Y6
+ VMOVDQU 800(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 832(CX), Y6
+ VMOVDQU 864(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 896(CX), Y6
+ VMOVDQU 928(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y8
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 960(CX), Y6
+ VMOVDQU 992(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1024(CX), Y6
+ VMOVDQU 1056(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1088(CX), Y6
+ VMOVDQU 1120(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1152(CX), Y6
+ VMOVDQU 1184(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1216(CX), Y6
+ VMOVDQU 1248(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y8
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1280(CX), Y6
+ VMOVDQU 1312(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1344(CX), Y6
+ VMOVDQU 1376(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1408(CX), Y6
+ VMOVDQU 1440(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1472(CX), Y6
+ VMOVDQU 1504(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1536(CX), Y6
+ VMOVDQU 1568(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y8
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1600(CX), Y6
+ VMOVDQU 1632(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1664(CX), Y6
+ VMOVDQU 1696(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1728(CX), Y6
+ VMOVDQU 1760(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1792(CX), Y6
+ VMOVDQU 1824(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1856(CX), Y6
+ VMOVDQU 1888(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R11), Y8
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1920(CX), Y6
+ VMOVDQU 1952(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1984(CX), Y6
+ VMOVDQU 2016(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2048(CX), Y6
+ VMOVDQU 2080(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2112(CX), Y6
+ VMOVDQU 2144(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2176(CX), Y6
+ VMOVDQU 2208(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (R12), Y8
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 2240(CX), Y6
+ VMOVDQU 2272(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 2304(CX), Y6
+ VMOVDQU 2336(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2368(CX), Y6
+ VMOVDQU 2400(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2432(CX), Y6
+ VMOVDQU 2464(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2496(CX), Y6
+ VMOVDQU 2528(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 8 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 2560(CX), Y6
+ VMOVDQU 2592(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 2624(CX), Y6
+ VMOVDQU 2656(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2688(CX), Y6
+ VMOVDQU 2720(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2752(CX), Y6
+ VMOVDQU 2784(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2816(CX), Y6
+ VMOVDQU 2848(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y0, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y1, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y2, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y3, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y4, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxTwo_9x5Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_9x5Xor_end:
+ RET
+
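+// The 9x6 shape needs six YMM accumulators (Y0-Y5), the nibble mask in Y6 and
+// four scratch registers for the shifted input and table halves (Y7-Y10), so
+// like the 9x5 kernels it keeps its destinations on the stack and addresses
+// them through out_base.
+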
+// func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
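+//
+// Computes 6 output slices as GF(2^8) linear combinations of 9 input slices, 32 bytes
+// per loop iteration. Each input byte is split into its low and high nibbles (VPAND and
+// VPSRLQ+VPAND), each nibble selects a 16-entry lookup table with VPSHUFB, and the two
+// partial products are combined with VPXOR for input 0 and the XOR3WAY macro for the
+// remaining inputs, accumulating into Y0-Y5.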
+TEXT ·mulAvxTwo_9x6(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 119 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x6_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_9x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y3
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y5
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y9
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y9
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1536(CX), Y7
+ VMOVDQU 1568(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1600(CX), Y7
+ VMOVDQU 1632(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1664(CX), Y7
+ VMOVDQU 1696(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1728(CX), Y7
+ VMOVDQU 1760(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1792(CX), Y7
+ VMOVDQU 1824(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1856(CX), Y7
+ VMOVDQU 1888(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y9
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1920(CX), Y7
+ VMOVDQU 1952(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1984(CX), Y7
+ VMOVDQU 2016(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2048(CX), Y7
+ VMOVDQU 2080(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2112(CX), Y7
+ VMOVDQU 2144(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2176(CX), Y7
+ VMOVDQU 2208(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2240(CX), Y7
+ VMOVDQU 2272(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y9
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 2304(CX), Y7
+ VMOVDQU 2336(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 2368(CX), Y7
+ VMOVDQU 2400(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2432(CX), Y7
+ VMOVDQU 2464(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2496(CX), Y7
+ VMOVDQU 2528(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2560(CX), Y7
+ VMOVDQU 2592(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2624(CX), Y7
+ VMOVDQU 2656(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (R12), Y9
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 2688(CX), Y7
+ VMOVDQU 2720(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 2752(CX), Y7
+ VMOVDQU 2784(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2816(CX), Y7
+ VMOVDQU 2848(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2880(CX), Y7
+ VMOVDQU 2912(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2944(CX), Y7
+ VMOVDQU 2976(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 3008(CX), Y7
+ VMOVDQU 3040(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 8 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 3072(CX), Y7
+ VMOVDQU 3104(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 3136(CX), Y7
+ VMOVDQU 3168(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 3200(CX), Y7
+ VMOVDQU 3232(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 3264(CX), Y7
+ VMOVDQU 3296(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 3328(CX), Y7
+ VMOVDQU 3360(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 3392(CX), Y7
+ VMOVDQU 3424(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y0, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y1, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y2, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y3, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y4, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxTwo_9x6_loop
+ VZEROUPPER
+
+mulAvxTwo_9x6_end:
+ RET
+
+// func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
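+//
+// GFNI/AVX-512 version: 64 bytes per iteration in ZMM registers. The first 24 of the
+// 54 coefficient matrices are broadcast into Z0-Z23 with VBROADCASTF32X2; later inputs
+// use the embedded-broadcast form VGF2P8AFFINEQB.BCST directly from memory. Each
+// coefficient is an 8x8 bit matrix, so a single affine instruction multiplies a whole
+// vector by that coefficient; products are XORed into Z24-Z29.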
+TEXT ·mulGFNI_9x6_64(SB), $0-88
+ // Loading 24 of 54 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 62 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 6 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 6 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x6_64_loop
+ VZEROUPPER
+
+mulGFNI_9x6_64_end:
+ RET
+
+// func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
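+//
+// 256-bit GFNI version for CPUs with GFNI but without AVX-512: 32 bytes per iteration.
+// Only 8 coefficients stay resident in Y0-Y7; the rest are re-broadcast from the matrix
+// with VBROADCASTSD before each VGF2P8AFFINEQB, and the 6 outputs accumulate in Y8-Y13
+// via VXORPD.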
+TEXT ·mulAvxGFNI_9x6(SB), $0-88
+ // Loading 8 of 54 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 62 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x6_end:
+ RET
+
+// func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
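+//
+// Xor form of mulGFNI_9x6_64: the current contents of the 6 destinations are loaded
+// into Z24-Z29 at the top of each iteration, so the matrix products are XORed into the
+// existing output data instead of overwriting it.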
+TEXT ·mulGFNI_9x6_64Xor(SB), $0-88
+ // Loading 24 of 54 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 62 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x6_64Xor_loop:
+ // Load 6 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 (R15)(R14*1), Z24
+ MOVQ 24(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z25
+ MOVQ 48(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z26
+ MOVQ 72(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z27
+ MOVQ 96(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z28
+ MOVQ 120(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z29
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 6 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 6 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
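+//
+// Xor form of mulAvxGFNI_9x6: each iteration first loads the current output blocks into
+// Y8-Y13, then accumulates the affine products on top of them.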
+TEXT ·mulAvxGFNI_9x6Xor(SB), $0-88
+ // Loading 8 of 54 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 62 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x6Xor_loop:
+ // Load 6 outputs
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y8
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y9
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y10
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y11
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y12
+ MOVQ 120(R13), R15
+ VMOVDQU (R15)(R14*1), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x6Xor_end:
+ RET
+
+// func mulAvxTwo_9x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
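+//
+// Xor form of mulAvxTwo_9x6: while processing input 0 the destination blocks are loaded
+// into Y0-Y5 and updated with XOR3WAY, so the results are added (XORed) to whatever the
+// outputs already contain.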
+TEXT ·mulAvxTwo_9x6Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 119 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x6Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_9x6Xor_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y0
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y2
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y3
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y4
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ MOVQ 120(R13), R15
+ VMOVDQU (R15)(R14*1), Y5
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y9
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y9
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1536(CX), Y7
+ VMOVDQU 1568(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1600(CX), Y7
+ VMOVDQU 1632(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1664(CX), Y7
+ VMOVDQU 1696(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1728(CX), Y7
+ VMOVDQU 1760(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1792(CX), Y7
+ VMOVDQU 1824(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1856(CX), Y7
+ VMOVDQU 1888(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y9
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1920(CX), Y7
+ VMOVDQU 1952(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1984(CX), Y7
+ VMOVDQU 2016(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2048(CX), Y7
+ VMOVDQU 2080(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2112(CX), Y7
+ VMOVDQU 2144(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2176(CX), Y7
+ VMOVDQU 2208(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2240(CX), Y7
+ VMOVDQU 2272(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y9
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 2304(CX), Y7
+ VMOVDQU 2336(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 2368(CX), Y7
+ VMOVDQU 2400(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2432(CX), Y7
+ VMOVDQU 2464(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2496(CX), Y7
+ VMOVDQU 2528(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2560(CX), Y7
+ VMOVDQU 2592(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2624(CX), Y7
+ VMOVDQU 2656(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (R12), Y9
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 2688(CX), Y7
+ VMOVDQU 2720(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 2752(CX), Y7
+ VMOVDQU 2784(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2816(CX), Y7
+ VMOVDQU 2848(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2880(CX), Y7
+ VMOVDQU 2912(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2944(CX), Y7
+ VMOVDQU 2976(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 3008(CX), Y7
+ VMOVDQU 3040(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 8 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 3072(CX), Y7
+ VMOVDQU 3104(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 3136(CX), Y7
+ VMOVDQU 3168(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 3200(CX), Y7
+ VMOVDQU 3232(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 3264(CX), Y7
+ VMOVDQU 3296(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 3328(CX), Y7
+ VMOVDQU 3360(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 3392(CX), Y7
+ VMOVDQU 3424(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y0, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y1, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y2, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y3, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y4, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxTwo_9x6Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_9x6Xor_end:
+ RET
+
+// func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
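+//
+// Same nibble-split/VPSHUFB scheme as mulAvxTwo_9x6, widened to 7 outputs: products
+// accumulate in Y0-Y6, the 0x0f mask lives in Y7, and each input consumes 7 table pairs
+// (448 bytes) of the coefficient matrix.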
+TEXT ·mulAvxTwo_9x7(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 138 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x7_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_9x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y0
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y1
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y2
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y3
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y4
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y5
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y6
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 896(CX), Y8
+ VMOVDQU 928(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 960(CX), Y8
+ VMOVDQU 992(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1024(CX), Y8
+ VMOVDQU 1056(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1088(CX), Y8
+ VMOVDQU 1120(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1152(CX), Y8
+ VMOVDQU 1184(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1216(CX), Y8
+ VMOVDQU 1248(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1280(CX), Y8
+ VMOVDQU 1312(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y10
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1344(CX), Y8
+ VMOVDQU 1376(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1408(CX), Y8
+ VMOVDQU 1440(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1472(CX), Y8
+ VMOVDQU 1504(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1536(CX), Y8
+ VMOVDQU 1568(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1600(CX), Y8
+ VMOVDQU 1632(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1664(CX), Y8
+ VMOVDQU 1696(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1728(CX), Y8
+ VMOVDQU 1760(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y10
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1792(CX), Y8
+ VMOVDQU 1824(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1856(CX), Y8
+ VMOVDQU 1888(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1920(CX), Y8
+ VMOVDQU 1952(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1984(CX), Y8
+ VMOVDQU 2016(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2048(CX), Y8
+ VMOVDQU 2080(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2112(CX), Y8
+ VMOVDQU 2144(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2176(CX), Y8
+ VMOVDQU 2208(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y10
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2240(CX), Y8
+ VMOVDQU 2272(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2304(CX), Y8
+ VMOVDQU 2336(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2368(CX), Y8
+ VMOVDQU 2400(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2432(CX), Y8
+ VMOVDQU 2464(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2496(CX), Y8
+ VMOVDQU 2528(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2560(CX), Y8
+ VMOVDQU 2592(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2624(CX), Y8
+ VMOVDQU 2656(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y10
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2688(CX), Y8
+ VMOVDQU 2720(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2752(CX), Y8
+ VMOVDQU 2784(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2816(CX), Y8
+ VMOVDQU 2848(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2880(CX), Y8
+ VMOVDQU 2912(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2944(CX), Y8
+ VMOVDQU 2976(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3008(CX), Y8
+ VMOVDQU 3040(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3072(CX), Y8
+ VMOVDQU 3104(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (R12), Y10
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 3136(CX), Y8
+ VMOVDQU 3168(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 3200(CX), Y8
+ VMOVDQU 3232(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 3264(CX), Y8
+ VMOVDQU 3296(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 3328(CX), Y8
+ VMOVDQU 3360(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 3392(CX), Y8
+ VMOVDQU 3424(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3456(CX), Y8
+ VMOVDQU 3488(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3520(CX), Y8
+ VMOVDQU 3552(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 8 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 3584(CX), Y8
+ VMOVDQU 3616(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 3648(CX), Y8
+ VMOVDQU 3680(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 3712(CX), Y8
+ VMOVDQU 3744(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 3776(CX), Y8
+ VMOVDQU 3808(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 3840(CX), Y8
+ VMOVDQU 3872(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3904(CX), Y8
+ VMOVDQU 3936(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3968(CX), Y8
+ VMOVDQU 4000(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y0, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y1, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y2, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y3, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y4, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxTwo_9x7_loop
+ VZEROUPPER
+
+mulAvxTwo_9x7_end:
+ RET
+
+// func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
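+//
+// GFNI/AVX-512 version with 7 outputs: 23 of the 63 coefficient matrices are kept in
+// Z0-Z22, the remainder are applied with VGF2P8AFFINEQB.BCST, and each iteration
+// processes 64 bytes per input, accumulating into Z23-Z29.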
+TEXT ·mulGFNI_9x7_64(SB), $0-88
+ // Loading 23 of 63 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 72 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 7 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 7 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z23, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x7_64_loop
+ VZEROUPPER
+
+mulGFNI_9x7_64_end:
+ RET
+
+// func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x7(SB), $0-88
+ // Loading 7 of 63 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 72 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x7_end:
+ RET
+
+// func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x7_64Xor(SB), $0-88
+ // Loading 23 of 63 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 72 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x7_64Xor_loop:
+ // Load 7 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 (R15)(R14*1), Z23
+ MOVQ 24(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z24
+ MOVQ 48(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z25
+ MOVQ 72(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z26
+ MOVQ 96(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z27
+ MOVQ 120(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z28
+ MOVQ 144(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z29
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 7 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 7 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z23, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x7Xor(SB), $0-88
+ // Loading 7 of 63 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 72 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x7Xor_loop:
+ // Load 7 outputs
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y7
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y8
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y9
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y10
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y11
+ MOVQ 120(R13), R15
+ VMOVDQU (R15)(R14*1), Y12
+ MOVQ 144(R13), R15
+ VMOVDQU (R15)(R14*1), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x7Xor_end:
+ RET
+
+// func mulAvxTwo_9x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_9x7Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 138 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x7Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_9x7Xor_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y0
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y1
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y2
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y3
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y4
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ MOVQ 120(R13), R15
+ VMOVDQU (R15)(R14*1), Y5
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ MOVQ 144(R13), R15
+ VMOVDQU (R15)(R14*1), Y6
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 896(CX), Y8
+ VMOVDQU 928(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 960(CX), Y8
+ VMOVDQU 992(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1024(CX), Y8
+ VMOVDQU 1056(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1088(CX), Y8
+ VMOVDQU 1120(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1152(CX), Y8
+ VMOVDQU 1184(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1216(CX), Y8
+ VMOVDQU 1248(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1280(CX), Y8
+ VMOVDQU 1312(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y10
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1344(CX), Y8
+ VMOVDQU 1376(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1408(CX), Y8
+ VMOVDQU 1440(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1472(CX), Y8
+ VMOVDQU 1504(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1536(CX), Y8
+ VMOVDQU 1568(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1600(CX), Y8
+ VMOVDQU 1632(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1664(CX), Y8
+ VMOVDQU 1696(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1728(CX), Y8
+ VMOVDQU 1760(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y10
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1792(CX), Y8
+ VMOVDQU 1824(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1856(CX), Y8
+ VMOVDQU 1888(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1920(CX), Y8
+ VMOVDQU 1952(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1984(CX), Y8
+ VMOVDQU 2016(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2048(CX), Y8
+ VMOVDQU 2080(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2112(CX), Y8
+ VMOVDQU 2144(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2176(CX), Y8
+ VMOVDQU 2208(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y10
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2240(CX), Y8
+ VMOVDQU 2272(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2304(CX), Y8
+ VMOVDQU 2336(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2368(CX), Y8
+ VMOVDQU 2400(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2432(CX), Y8
+ VMOVDQU 2464(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2496(CX), Y8
+ VMOVDQU 2528(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2560(CX), Y8
+ VMOVDQU 2592(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2624(CX), Y8
+ VMOVDQU 2656(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y10
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2688(CX), Y8
+ VMOVDQU 2720(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2752(CX), Y8
+ VMOVDQU 2784(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2816(CX), Y8
+ VMOVDQU 2848(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2880(CX), Y8
+ VMOVDQU 2912(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2944(CX), Y8
+ VMOVDQU 2976(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3008(CX), Y8
+ VMOVDQU 3040(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3072(CX), Y8
+ VMOVDQU 3104(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (R12), Y10
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 3136(CX), Y8
+ VMOVDQU 3168(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 3200(CX), Y8
+ VMOVDQU 3232(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 3264(CX), Y8
+ VMOVDQU 3296(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 3328(CX), Y8
+ VMOVDQU 3360(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 3392(CX), Y8
+ VMOVDQU 3424(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3456(CX), Y8
+ VMOVDQU 3488(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3520(CX), Y8
+ VMOVDQU 3552(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 8 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 3584(CX), Y8
+ VMOVDQU 3616(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 3648(CX), Y8
+ VMOVDQU 3680(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 3712(CX), Y8
+ VMOVDQU 3744(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 3776(CX), Y8
+ VMOVDQU 3808(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 3840(CX), Y8
+ VMOVDQU 3872(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3904(CX), Y8
+ VMOVDQU 3936(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3968(CX), Y8
+ VMOVDQU 4000(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y0, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y1, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y2, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y3, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y4, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxTwo_9x7Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_9x7Xor_end:
+ RET
+
+// func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_9x8(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 157 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x8_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_9x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y0
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y2
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y4
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y5
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y6
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y7
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1024(CX), Y9
+ VMOVDQU 1056(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1088(CX), Y9
+ VMOVDQU 1120(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1152(CX), Y9
+ VMOVDQU 1184(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1216(CX), Y9
+ VMOVDQU 1248(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y9
+ VMOVDQU 1312(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1344(CX), Y9
+ VMOVDQU 1376(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1408(CX), Y9
+ VMOVDQU 1440(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1472(CX), Y9
+ VMOVDQU 1504(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y11
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1536(CX), Y9
+ VMOVDQU 1568(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1600(CX), Y9
+ VMOVDQU 1632(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1664(CX), Y9
+ VMOVDQU 1696(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1728(CX), Y9
+ VMOVDQU 1760(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1792(CX), Y9
+ VMOVDQU 1824(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1856(CX), Y9
+ VMOVDQU 1888(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1920(CX), Y9
+ VMOVDQU 1952(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1984(CX), Y9
+ VMOVDQU 2016(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y11
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2048(CX), Y9
+ VMOVDQU 2080(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2112(CX), Y9
+ VMOVDQU 2144(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2176(CX), Y9
+ VMOVDQU 2208(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2240(CX), Y9
+ VMOVDQU 2272(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2304(CX), Y9
+ VMOVDQU 2336(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2368(CX), Y9
+ VMOVDQU 2400(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2432(CX), Y9
+ VMOVDQU 2464(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 2496(CX), Y9
+ VMOVDQU 2528(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y11
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2560(CX), Y9
+ VMOVDQU 2592(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2624(CX), Y9
+ VMOVDQU 2656(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2688(CX), Y9
+ VMOVDQU 2720(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2752(CX), Y9
+ VMOVDQU 2784(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2816(CX), Y9
+ VMOVDQU 2848(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2880(CX), Y9
+ VMOVDQU 2912(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2944(CX), Y9
+ VMOVDQU 2976(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3008(CX), Y9
+ VMOVDQU 3040(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y11
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 3072(CX), Y9
+ VMOVDQU 3104(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 3136(CX), Y9
+ VMOVDQU 3168(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 3200(CX), Y9
+ VMOVDQU 3232(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 3264(CX), Y9
+ VMOVDQU 3296(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 3328(CX), Y9
+ VMOVDQU 3360(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 3392(CX), Y9
+ VMOVDQU 3424(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 3456(CX), Y9
+ VMOVDQU 3488(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3520(CX), Y9
+ VMOVDQU 3552(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (R12), Y11
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 3584(CX), Y9
+ VMOVDQU 3616(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 3648(CX), Y9
+ VMOVDQU 3680(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 3712(CX), Y9
+ VMOVDQU 3744(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 3776(CX), Y9
+ VMOVDQU 3808(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 3840(CX), Y9
+ VMOVDQU 3872(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 3904(CX), Y9
+ VMOVDQU 3936(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 3968(CX), Y9
+ VMOVDQU 4000(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 4032(CX), Y9
+ VMOVDQU 4064(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 8 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 4096(CX), Y9
+ VMOVDQU 4128(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 4160(CX), Y9
+ VMOVDQU 4192(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 4224(CX), Y9
+ VMOVDQU 4256(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 4288(CX), Y9
+ VMOVDQU 4320(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 4352(CX), Y9
+ VMOVDQU 4384(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 4416(CX), Y9
+ VMOVDQU 4448(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 4480(CX), Y9
+ VMOVDQU 4512(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 4544(CX), Y9
+ VMOVDQU 4576(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y0, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y1, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y2, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y3, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y4, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxTwo_9x8_loop
+ VZEROUPPER
+
+mulAvxTwo_9x8_end:
+ RET
+
+// func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x8_64(SB), $0-88
+ // Loading 22 of 72 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 8 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 8 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z22, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z23, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x8_64_loop
+ VZEROUPPER
+
+mulGFNI_9x8_64_end:
+ RET
+
+// func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x8(SB), $0-88
+ // Loading 6 of 72 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x8_end:
+ RET
+
+// func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x8_64Xor(SB), $0-88
+ // Loading 22 of 72 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x8_64Xor_loop:
+ // Load 8 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 (R15)(R14*1), Z22
+ MOVQ 24(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z23
+ MOVQ 48(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z24
+ MOVQ 72(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z25
+ MOVQ 96(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z26
+ MOVQ 120(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z27
+ MOVQ 144(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z28
+ MOVQ 168(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z29
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 8 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 8 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z22, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z23, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x8Xor(SB), $0-88
+ // Loading 6 of 72 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x8Xor_loop:
+ // Load 8 outputs
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y6
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y7
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y8
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y9
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y10
+ MOVQ 120(R13), R15
+ VMOVDQU (R15)(R14*1), Y11
+ MOVQ 144(R13), R15
+ VMOVDQU (R15)(R14*1), Y12
+ MOVQ 168(R13), R15
+ VMOVDQU (R15)(R14*1), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x8Xor_end:
+ RET
+
+// func mulAvxTwo_9x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_9x8Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 157 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x8Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_9x8Xor_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y0
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y1
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y2
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y3
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y4
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ MOVQ 120(R13), R15
+ VMOVDQU (R15)(R14*1), Y5
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ MOVQ 144(R13), R15
+ VMOVDQU (R15)(R14*1), Y6
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ MOVQ 168(R13), R15
+ VMOVDQU (R15)(R14*1), Y7
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1024(CX), Y9
+ VMOVDQU 1056(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1088(CX), Y9
+ VMOVDQU 1120(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1152(CX), Y9
+ VMOVDQU 1184(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1216(CX), Y9
+ VMOVDQU 1248(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y9
+ VMOVDQU 1312(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1344(CX), Y9
+ VMOVDQU 1376(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1408(CX), Y9
+ VMOVDQU 1440(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1472(CX), Y9
+ VMOVDQU 1504(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y11
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1536(CX), Y9
+ VMOVDQU 1568(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1600(CX), Y9
+ VMOVDQU 1632(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1664(CX), Y9
+ VMOVDQU 1696(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1728(CX), Y9
+ VMOVDQU 1760(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1792(CX), Y9
+ VMOVDQU 1824(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1856(CX), Y9
+ VMOVDQU 1888(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1920(CX), Y9
+ VMOVDQU 1952(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1984(CX), Y9
+ VMOVDQU 2016(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y11
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2048(CX), Y9
+ VMOVDQU 2080(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2112(CX), Y9
+ VMOVDQU 2144(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2176(CX), Y9
+ VMOVDQU 2208(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2240(CX), Y9
+ VMOVDQU 2272(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2304(CX), Y9
+ VMOVDQU 2336(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2368(CX), Y9
+ VMOVDQU 2400(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2432(CX), Y9
+ VMOVDQU 2464(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 2496(CX), Y9
+ VMOVDQU 2528(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y11
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2560(CX), Y9
+ VMOVDQU 2592(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2624(CX), Y9
+ VMOVDQU 2656(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2688(CX), Y9
+ VMOVDQU 2720(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2752(CX), Y9
+ VMOVDQU 2784(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2816(CX), Y9
+ VMOVDQU 2848(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2880(CX), Y9
+ VMOVDQU 2912(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2944(CX), Y9
+ VMOVDQU 2976(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3008(CX), Y9
+ VMOVDQU 3040(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y11
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 3072(CX), Y9
+ VMOVDQU 3104(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 3136(CX), Y9
+ VMOVDQU 3168(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 3200(CX), Y9
+ VMOVDQU 3232(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 3264(CX), Y9
+ VMOVDQU 3296(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 3328(CX), Y9
+ VMOVDQU 3360(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 3392(CX), Y9
+ VMOVDQU 3424(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 3456(CX), Y9
+ VMOVDQU 3488(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3520(CX), Y9
+ VMOVDQU 3552(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (R12), Y11
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 3584(CX), Y9
+ VMOVDQU 3616(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 3648(CX), Y9
+ VMOVDQU 3680(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 3712(CX), Y9
+ VMOVDQU 3744(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 3776(CX), Y9
+ VMOVDQU 3808(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 3840(CX), Y9
+ VMOVDQU 3872(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 3904(CX), Y9
+ VMOVDQU 3936(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 3968(CX), Y9
+ VMOVDQU 4000(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 4032(CX), Y9
+ VMOVDQU 4064(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 8 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 4096(CX), Y9
+ VMOVDQU 4128(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 4160(CX), Y9
+ VMOVDQU 4192(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 4224(CX), Y9
+ VMOVDQU 4256(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 4288(CX), Y9
+ VMOVDQU 4320(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 4352(CX), Y9
+ VMOVDQU 4384(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 4416(CX), Y9
+ VMOVDQU 4448(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 4480(CX), Y9
+ VMOVDQU 4512(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 4544(CX), Y9
+ VMOVDQU 4576(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y0, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y1, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y2, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y3, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y4, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxTwo_9x8Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_9x8Xor_end:
+ RET
+
+// func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_9x9(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 176 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x9_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_9x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y0
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y1
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y2
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y3
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y4
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y5
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y6
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y7
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y8
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y12
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1152(CX), Y10
+ VMOVDQU 1184(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1216(CX), Y10
+ VMOVDQU 1248(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1280(CX), Y10
+ VMOVDQU 1312(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1344(CX), Y10
+ VMOVDQU 1376(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1408(CX), Y10
+ VMOVDQU 1440(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 1472(CX), Y10
+ VMOVDQU 1504(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 1536(CX), Y10
+ VMOVDQU 1568(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1600(CX), Y10
+ VMOVDQU 1632(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1664(CX), Y10
+ VMOVDQU 1696(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y12
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1728(CX), Y10
+ VMOVDQU 1760(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1792(CX), Y10
+ VMOVDQU 1824(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1856(CX), Y10
+ VMOVDQU 1888(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1920(CX), Y10
+ VMOVDQU 1952(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1984(CX), Y10
+ VMOVDQU 2016(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2048(CX), Y10
+ VMOVDQU 2080(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2112(CX), Y10
+ VMOVDQU 2144(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2176(CX), Y10
+ VMOVDQU 2208(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2240(CX), Y10
+ VMOVDQU 2272(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y12
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2304(CX), Y10
+ VMOVDQU 2336(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2368(CX), Y10
+ VMOVDQU 2400(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 2432(CX), Y10
+ VMOVDQU 2464(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 2496(CX), Y10
+ VMOVDQU 2528(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 2560(CX), Y10
+ VMOVDQU 2592(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2624(CX), Y10
+ VMOVDQU 2656(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2688(CX), Y10
+ VMOVDQU 2720(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2752(CX), Y10
+ VMOVDQU 2784(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2816(CX), Y10
+ VMOVDQU 2848(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y12
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2880(CX), Y10
+ VMOVDQU 2912(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2944(CX), Y10
+ VMOVDQU 2976(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3008(CX), Y10
+ VMOVDQU 3040(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3072(CX), Y10
+ VMOVDQU 3104(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3136(CX), Y10
+ VMOVDQU 3168(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3200(CX), Y10
+ VMOVDQU 3232(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3264(CX), Y10
+ VMOVDQU 3296(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3328(CX), Y10
+ VMOVDQU 3360(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3392(CX), Y10
+ VMOVDQU 3424(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y12
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 3456(CX), Y10
+ VMOVDQU 3488(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 3520(CX), Y10
+ VMOVDQU 3552(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3584(CX), Y10
+ VMOVDQU 3616(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3648(CX), Y10
+ VMOVDQU 3680(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3712(CX), Y10
+ VMOVDQU 3744(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3776(CX), Y10
+ VMOVDQU 3808(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3840(CX), Y10
+ VMOVDQU 3872(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3904(CX), Y10
+ VMOVDQU 3936(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3968(CX), Y10
+ VMOVDQU 4000(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (R12), Y12
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 4032(CX), Y10
+ VMOVDQU 4064(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 4096(CX), Y10
+ VMOVDQU 4128(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 4160(CX), Y10
+ VMOVDQU 4192(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 4224(CX), Y10
+ VMOVDQU 4256(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 4288(CX), Y10
+ VMOVDQU 4320(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 4352(CX), Y10
+ VMOVDQU 4384(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 4416(CX), Y10
+ VMOVDQU 4448(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 4480(CX), Y10
+ VMOVDQU 4512(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 4544(CX), Y10
+ VMOVDQU 4576(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 8 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 4608(CX), Y10
+ VMOVDQU 4640(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 4672(CX), Y10
+ VMOVDQU 4704(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 4736(CX), Y10
+ VMOVDQU 4768(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 4800(CX), Y10
+ VMOVDQU 4832(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 4864(CX), Y10
+ VMOVDQU 4896(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 4928(CX), Y10
+ VMOVDQU 4960(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 4992(CX), Y10
+ VMOVDQU 5024(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 5056(CX), Y10
+ VMOVDQU 5088(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 5120(CX), Y10
+ VMOVDQU 5152(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y0, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y1, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y2, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y3, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y4, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxTwo_9x9_loop
+ VZEROUPPER
+
+mulAvxTwo_9x9_end:
+ RET
+
+// func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x9_64(SB), $0-88
+ // Loading 21 of 81 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 92 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 9 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 9 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z21, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z22, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z23, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x9_64_loop
+ VZEROUPPER
+
+mulGFNI_9x9_64_end:
+ RET
+
+// func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x9(SB), $0-88
+ // Loading 5 of 81 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 92 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 640(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x9_end:
+ RET
+
+// func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x9_64Xor(SB), $0-88
+ // Loading 21 of 81 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 92 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x9_64Xor_loop:
+ // Load 9 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 (R15)(R14*1), Z21
+ MOVQ 24(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z22
+ MOVQ 48(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z23
+ MOVQ 72(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z24
+ MOVQ 96(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z25
+ MOVQ 120(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z26
+ MOVQ 144(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z27
+ MOVQ 168(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z28
+ MOVQ 192(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z29
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 9 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 9 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z21, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z22, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z23, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x9Xor(SB), $0-88
+ // Loading 5 of 81 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 92 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x9Xor_loop:
+ // Load 9 outputs
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y5
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y6
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y7
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y8
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y9
+ MOVQ 120(R13), R15
+ VMOVDQU (R15)(R14*1), Y10
+ MOVQ 144(R13), R15
+ VMOVDQU (R15)(R14*1), Y11
+ MOVQ 168(R13), R15
+ VMOVDQU (R15)(R14*1), Y12
+ MOVQ 192(R13), R15
+ VMOVDQU (R15)(R14*1), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 640(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x9Xor_end:
+ RET
+
+// func mulAvxTwo_9x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_9x9Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 176 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x9Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_9x9Xor_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y0
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y1
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y2
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y3
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y4
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ MOVQ 120(R13), R15
+ VMOVDQU (R15)(R14*1), Y5
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ MOVQ 144(R13), R15
+ VMOVDQU (R15)(R14*1), Y6
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ MOVQ 168(R13), R15
+ VMOVDQU (R15)(R14*1), Y7
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ MOVQ 192(R13), R15
+ VMOVDQU (R15)(R14*1), Y8
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y12
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1152(CX), Y10
+ VMOVDQU 1184(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1216(CX), Y10
+ VMOVDQU 1248(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1280(CX), Y10
+ VMOVDQU 1312(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1344(CX), Y10
+ VMOVDQU 1376(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1408(CX), Y10
+ VMOVDQU 1440(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 1472(CX), Y10
+ VMOVDQU 1504(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 1536(CX), Y10
+ VMOVDQU 1568(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1600(CX), Y10
+ VMOVDQU 1632(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1664(CX), Y10
+ VMOVDQU 1696(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y12
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1728(CX), Y10
+ VMOVDQU 1760(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1792(CX), Y10
+ VMOVDQU 1824(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1856(CX), Y10
+ VMOVDQU 1888(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1920(CX), Y10
+ VMOVDQU 1952(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1984(CX), Y10
+ VMOVDQU 2016(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2048(CX), Y10
+ VMOVDQU 2080(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2112(CX), Y10
+ VMOVDQU 2144(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2176(CX), Y10
+ VMOVDQU 2208(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2240(CX), Y10
+ VMOVDQU 2272(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y12
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2304(CX), Y10
+ VMOVDQU 2336(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2368(CX), Y10
+ VMOVDQU 2400(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 2432(CX), Y10
+ VMOVDQU 2464(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 2496(CX), Y10
+ VMOVDQU 2528(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 2560(CX), Y10
+ VMOVDQU 2592(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2624(CX), Y10
+ VMOVDQU 2656(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2688(CX), Y10
+ VMOVDQU 2720(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2752(CX), Y10
+ VMOVDQU 2784(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2816(CX), Y10
+ VMOVDQU 2848(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y12
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2880(CX), Y10
+ VMOVDQU 2912(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2944(CX), Y10
+ VMOVDQU 2976(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3008(CX), Y10
+ VMOVDQU 3040(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3072(CX), Y10
+ VMOVDQU 3104(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3136(CX), Y10
+ VMOVDQU 3168(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3200(CX), Y10
+ VMOVDQU 3232(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3264(CX), Y10
+ VMOVDQU 3296(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3328(CX), Y10
+ VMOVDQU 3360(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3392(CX), Y10
+ VMOVDQU 3424(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y12
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 3456(CX), Y10
+ VMOVDQU 3488(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 3520(CX), Y10
+ VMOVDQU 3552(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3584(CX), Y10
+ VMOVDQU 3616(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3648(CX), Y10
+ VMOVDQU 3680(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3712(CX), Y10
+ VMOVDQU 3744(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3776(CX), Y10
+ VMOVDQU 3808(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3840(CX), Y10
+ VMOVDQU 3872(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3904(CX), Y10
+ VMOVDQU 3936(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3968(CX), Y10
+ VMOVDQU 4000(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (R12), Y12
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 4032(CX), Y10
+ VMOVDQU 4064(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 4096(CX), Y10
+ VMOVDQU 4128(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 4160(CX), Y10
+ VMOVDQU 4192(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 4224(CX), Y10
+ VMOVDQU 4256(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 4288(CX), Y10
+ VMOVDQU 4320(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 4352(CX), Y10
+ VMOVDQU 4384(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 4416(CX), Y10
+ VMOVDQU 4448(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 4480(CX), Y10
+ VMOVDQU 4512(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 4544(CX), Y10
+ VMOVDQU 4576(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 8 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 4608(CX), Y10
+ VMOVDQU 4640(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 4672(CX), Y10
+ VMOVDQU 4704(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 4736(CX), Y10
+ VMOVDQU 4768(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 4800(CX), Y10
+ VMOVDQU 4832(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 4864(CX), Y10
+ VMOVDQU 4896(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 4928(CX), Y10
+ VMOVDQU 4960(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 4992(CX), Y10
+ VMOVDQU 5024(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 5056(CX), Y10
+ VMOVDQU 5088(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 5120(CX), Y10
+ VMOVDQU 5152(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y0, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y1, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y2, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y3, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y4, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxTwo_9x9Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_9x9Xor_end:
+ RET
+
+// func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_9x10(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 195 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x10_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_9x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y0
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y1
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y2
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y3
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y4
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y5
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y6
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y7
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y8
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y9
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y13
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y13
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1280(CX), Y11
+ VMOVDQU 1312(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1344(CX), Y11
+ VMOVDQU 1376(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 1408(CX), Y11
+ VMOVDQU 1440(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 1472(CX), Y11
+ VMOVDQU 1504(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 1536(CX), Y11
+ VMOVDQU 1568(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 1600(CX), Y11
+ VMOVDQU 1632(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1664(CX), Y11
+ VMOVDQU 1696(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1728(CX), Y11
+ VMOVDQU 1760(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1792(CX), Y11
+ VMOVDQU 1824(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1856(CX), Y11
+ VMOVDQU 1888(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y13
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1920(CX), Y11
+ VMOVDQU 1952(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1984(CX), Y11
+ VMOVDQU 2016(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2048(CX), Y11
+ VMOVDQU 2080(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2112(CX), Y11
+ VMOVDQU 2144(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2176(CX), Y11
+ VMOVDQU 2208(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2240(CX), Y11
+ VMOVDQU 2272(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2304(CX), Y11
+ VMOVDQU 2336(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 2368(CX), Y11
+ VMOVDQU 2400(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 2432(CX), Y11
+ VMOVDQU 2464(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 2496(CX), Y11
+ VMOVDQU 2528(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y13
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 2560(CX), Y11
+ VMOVDQU 2592(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 2624(CX), Y11
+ VMOVDQU 2656(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2688(CX), Y11
+ VMOVDQU 2720(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2752(CX), Y11
+ VMOVDQU 2784(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2816(CX), Y11
+ VMOVDQU 2848(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2880(CX), Y11
+ VMOVDQU 2912(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2944(CX), Y11
+ VMOVDQU 2976(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3008(CX), Y11
+ VMOVDQU 3040(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3072(CX), Y11
+ VMOVDQU 3104(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3136(CX), Y11
+ VMOVDQU 3168(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y13
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3200(CX), Y11
+ VMOVDQU 3232(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3264(CX), Y11
+ VMOVDQU 3296(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3328(CX), Y11
+ VMOVDQU 3360(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 3392(CX), Y11
+ VMOVDQU 3424(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 3456(CX), Y11
+ VMOVDQU 3488(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 3520(CX), Y11
+ VMOVDQU 3552(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 3584(CX), Y11
+ VMOVDQU 3616(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3648(CX), Y11
+ VMOVDQU 3680(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3712(CX), Y11
+ VMOVDQU 3744(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3776(CX), Y11
+ VMOVDQU 3808(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y13
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3840(CX), Y11
+ VMOVDQU 3872(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3904(CX), Y11
+ VMOVDQU 3936(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3968(CX), Y11
+ VMOVDQU 4000(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 4032(CX), Y11
+ VMOVDQU 4064(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 4096(CX), Y11
+ VMOVDQU 4128(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 4160(CX), Y11
+ VMOVDQU 4192(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 4224(CX), Y11
+ VMOVDQU 4256(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 4288(CX), Y11
+ VMOVDQU 4320(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 4352(CX), Y11
+ VMOVDQU 4384(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 4416(CX), Y11
+ VMOVDQU 4448(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (R12), Y13
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 4480(CX), Y11
+ VMOVDQU 4512(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 4544(CX), Y11
+ VMOVDQU 4576(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 4608(CX), Y11
+ VMOVDQU 4640(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 4672(CX), Y11
+ VMOVDQU 4704(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 4736(CX), Y11
+ VMOVDQU 4768(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 4800(CX), Y11
+ VMOVDQU 4832(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 4864(CX), Y11
+ VMOVDQU 4896(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 4928(CX), Y11
+ VMOVDQU 4960(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 4992(CX), Y11
+ VMOVDQU 5024(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 5056(CX), Y11
+ VMOVDQU 5088(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 8 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 5120(CX), Y11
+ VMOVDQU 5152(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 5184(CX), Y11
+ VMOVDQU 5216(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 5248(CX), Y11
+ VMOVDQU 5280(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 5312(CX), Y11
+ VMOVDQU 5344(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 5376(CX), Y11
+ VMOVDQU 5408(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 5440(CX), Y11
+ VMOVDQU 5472(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 5504(CX), Y11
+ VMOVDQU 5536(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 5568(CX), Y11
+ VMOVDQU 5600(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 5632(CX), Y11
+ VMOVDQU 5664(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 5696(CX), Y11
+ VMOVDQU 5728(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y0, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y1, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y2, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y3, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y4, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 216(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxTwo_9x10_loop
+ VZEROUPPER
+
+mulAvxTwo_9x10_end:
+ RET
+
+// func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x10_64(SB), $0-88
+ // Loading 20 of 90 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 102 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 10 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 10 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z20, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z21, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z22, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z23, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 216(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x10_64_loop
+ VZEROUPPER
+
+mulGFNI_9x10_64_end:
+ RET
+
+// func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x10(SB), $0-88
+ // Loading 4 of 90 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 102 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 640(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 648(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 656(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 664(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 672(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 680(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 688(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 696(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 704(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 712(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y4, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 216(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x10_end:
+ RET
+
+// func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x10_64Xor(SB), $0-88
+ // Loading 20 of 90 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 102 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x10_64Xor_loop:
+ // Load 10 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 (R15)(R14*1), Z20
+ MOVQ 24(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z21
+ MOVQ 48(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z22
+ MOVQ 72(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z23
+ MOVQ 96(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z24
+ MOVQ 120(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z25
+ MOVQ 144(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z26
+ MOVQ 168(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z27
+ MOVQ 192(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z28
+ MOVQ 216(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 10 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 10 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z20, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z21, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z22, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z23, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 216(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x10Xor(SB), $0-88
+ // Loading 4 of 90 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 102 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x10Xor_loop:
+ // Load 10 outputs
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y4
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y5
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y6
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y7
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y8
+ MOVQ 120(R13), R15
+ VMOVDQU (R15)(R14*1), Y9
+ MOVQ 144(R13), R15
+ VMOVDQU (R15)(R14*1), Y10
+ MOVQ 168(R13), R15
+ VMOVDQU (R15)(R14*1), Y11
+ MOVQ 192(R13), R15
+ VMOVDQU (R15)(R14*1), Y12
+ MOVQ 216(R13), R15
+ VMOVDQU (R15)(R14*1), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 640(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 648(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 656(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 664(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 672(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 680(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 688(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 696(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 704(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 712(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y4, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 216(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x10Xor_end:
+ RET
+
+// func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_9x10Xor(SB), NOSPLIT, $0-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 195 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_9x10Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_9x10Xor_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y0
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y1
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y2
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y3
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y4
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ MOVQ 120(R13), R15
+ VMOVDQU (R15)(R14*1), Y5
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ MOVQ 144(R13), R15
+ VMOVDQU (R15)(R14*1), Y6
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ MOVQ 168(R13), R15
+ VMOVDQU (R15)(R14*1), Y7
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ MOVQ 192(R13), R15
+ VMOVDQU (R15)(R14*1), Y8
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ MOVQ 216(R13), R15
+ VMOVDQU (R15)(R14*1), Y9
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y13
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y13
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1280(CX), Y11
+ VMOVDQU 1312(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1344(CX), Y11
+ VMOVDQU 1376(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 1408(CX), Y11
+ VMOVDQU 1440(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 1472(CX), Y11
+ VMOVDQU 1504(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 1536(CX), Y11
+ VMOVDQU 1568(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 1600(CX), Y11
+ VMOVDQU 1632(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1664(CX), Y11
+ VMOVDQU 1696(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1728(CX), Y11
+ VMOVDQU 1760(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1792(CX), Y11
+ VMOVDQU 1824(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1856(CX), Y11
+ VMOVDQU 1888(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y13
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1920(CX), Y11
+ VMOVDQU 1952(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1984(CX), Y11
+ VMOVDQU 2016(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2048(CX), Y11
+ VMOVDQU 2080(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2112(CX), Y11
+ VMOVDQU 2144(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2176(CX), Y11
+ VMOVDQU 2208(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2240(CX), Y11
+ VMOVDQU 2272(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2304(CX), Y11
+ VMOVDQU 2336(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 2368(CX), Y11
+ VMOVDQU 2400(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 2432(CX), Y11
+ VMOVDQU 2464(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 2496(CX), Y11
+ VMOVDQU 2528(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y13
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 2560(CX), Y11
+ VMOVDQU 2592(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 2624(CX), Y11
+ VMOVDQU 2656(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2688(CX), Y11
+ VMOVDQU 2720(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2752(CX), Y11
+ VMOVDQU 2784(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2816(CX), Y11
+ VMOVDQU 2848(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2880(CX), Y11
+ VMOVDQU 2912(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2944(CX), Y11
+ VMOVDQU 2976(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3008(CX), Y11
+ VMOVDQU 3040(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3072(CX), Y11
+ VMOVDQU 3104(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3136(CX), Y11
+ VMOVDQU 3168(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y13
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3200(CX), Y11
+ VMOVDQU 3232(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3264(CX), Y11
+ VMOVDQU 3296(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3328(CX), Y11
+ VMOVDQU 3360(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 3392(CX), Y11
+ VMOVDQU 3424(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 3456(CX), Y11
+ VMOVDQU 3488(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 3520(CX), Y11
+ VMOVDQU 3552(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 3584(CX), Y11
+ VMOVDQU 3616(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3648(CX), Y11
+ VMOVDQU 3680(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3712(CX), Y11
+ VMOVDQU 3744(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3776(CX), Y11
+ VMOVDQU 3808(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y13
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3840(CX), Y11
+ VMOVDQU 3872(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3904(CX), Y11
+ VMOVDQU 3936(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3968(CX), Y11
+ VMOVDQU 4000(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 4032(CX), Y11
+ VMOVDQU 4064(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 4096(CX), Y11
+ VMOVDQU 4128(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 4160(CX), Y11
+ VMOVDQU 4192(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 4224(CX), Y11
+ VMOVDQU 4256(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 4288(CX), Y11
+ VMOVDQU 4320(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 4352(CX), Y11
+ VMOVDQU 4384(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 4416(CX), Y11
+ VMOVDQU 4448(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (R12), Y13
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 4480(CX), Y11
+ VMOVDQU 4512(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 4544(CX), Y11
+ VMOVDQU 4576(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 4608(CX), Y11
+ VMOVDQU 4640(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 4672(CX), Y11
+ VMOVDQU 4704(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 4736(CX), Y11
+ VMOVDQU 4768(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 4800(CX), Y11
+ VMOVDQU 4832(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 4864(CX), Y11
+ VMOVDQU 4896(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 4928(CX), Y11
+ VMOVDQU 4960(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 4992(CX), Y11
+ VMOVDQU 5024(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 5056(CX), Y11
+ VMOVDQU 5088(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 8 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 5120(CX), Y11
+ VMOVDQU 5152(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 5184(CX), Y11
+ VMOVDQU 5216(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 5248(CX), Y11
+ VMOVDQU 5280(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 5312(CX), Y11
+ VMOVDQU 5344(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 5376(CX), Y11
+ VMOVDQU 5408(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 5440(CX), Y11
+ VMOVDQU 5472(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 5504(CX), Y11
+ VMOVDQU 5536(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 5568(CX), Y11
+ VMOVDQU 5600(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 5632(CX), Y11
+ VMOVDQU 5664(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 5696(CX), Y11
+ VMOVDQU 5728(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y0, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y1, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y2, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y3, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y4, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 216(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxTwo_9x10Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_9x10Xor_end:
+ RET
+
+// func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x1_64(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 46 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x1_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ (R14), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R14
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X2
+ VPBROADCASTB X2, Y2
+
+mulAvxTwo_10x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU 32(BX), Y5
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ VPXOR Y3, Y4, Y0
+ VPXOR Y5, Y6, Y1
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU 32(SI), Y5
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU 32(DI), Y5
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU (R8), Y6
+ VMOVDQU 32(R8), Y5
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU (R9), Y6
+ VMOVDQU 32(R9), Y5
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 256(CX), Y3
+ VMOVDQU 288(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU (R10), Y6
+ VMOVDQU 32(R10), Y5
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 320(CX), Y3
+ VMOVDQU 352(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU (R11), Y6
+ VMOVDQU 32(R11), Y5
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 384(CX), Y3
+ VMOVDQU 416(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU (R12), Y6
+ VMOVDQU 32(R12), Y5
+ ADDQ $0x40, R12
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 448(CX), Y3
+ VMOVDQU 480(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 8 to 1 outputs
+ VMOVDQU (R13), Y6
+ VMOVDQU 32(R13), Y5
+ ADDQ $0x40, R13
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 512(CX), Y3
+ VMOVDQU 544(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 9 to 1 outputs
+ VMOVDQU (DX), Y6
+ VMOVDQU 32(DX), Y5
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 576(CX), Y3
+ VMOVDQU 608(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Store 1 outputs
+ VMOVDQU Y0, (R14)
+ VMOVDQU Y1, 32(R14)
+ ADDQ $0x40, R14
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_10x1_64_loop
+ VZEROUPPER
+
+mulAvxTwo_10x1_64_end:
+ RET
+
+// func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 13 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), R12
+ MOVQ 216(CX), CX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, CX
+
+mulGFNI_10x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z11
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z11, Z10
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z11
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z11
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z11
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z11
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (R9), Z11
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z5, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU64 (R10), Z11
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z6, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU64 (R11), Z11
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z7, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 8 to 1 outputs
+ VMOVDQU64 (R12), Z11
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB $0x00, Z8, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 9 to 1 outputs
+ VMOVDQU64 (CX), Z11
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z9, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Store 1 outputs
+ VMOVDQU64 Z10, (R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_10x1_64_loop
+ VZEROUPPER
+
+mulGFNI_10x1_64_end:
+ RET
+
+// func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 13 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), R12
+ MOVQ 216(CX), CX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, CX
+
+mulAvxGFNI_10x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y11, Y10
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y11
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (R9), Y11
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y5, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 6 to 1 outputs
+ VMOVDQU (R10), Y11
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y6, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 7 to 1 outputs
+ VMOVDQU (R11), Y11
+ ADDQ $0x20, R11
+ VGF2P8AFFINEQB $0x00, Y7, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 8 to 1 outputs
+ VMOVDQU (R12), Y11
+ ADDQ $0x20, R12
+ VGF2P8AFFINEQB $0x00, Y8, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 9 to 1 outputs
+ VMOVDQU (CX), Y11
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y9, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Store 1 outputs
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_10x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x1_end:
+ RET
+
+// func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 13 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), R12
+ MOVQ 216(CX), CX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, CX
+
+mulGFNI_10x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (R13), Z10
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z11
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z11
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z11
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z11
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z11
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (R9), Z11
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z5, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU64 (R10), Z11
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z6, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU64 (R11), Z11
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z7, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 8 to 1 outputs
+ VMOVDQU64 (R12), Z11
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB $0x00, Z8, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 9 to 1 outputs
+ VMOVDQU64 (CX), Z11
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z9, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Store 1 outputs
+ VMOVDQU64 Z10, (R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_10x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 13 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), R12
+ MOVQ 216(CX), CX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, CX
+
+mulAvxGFNI_10x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R13), Y10
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y11
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (R9), Y11
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y5, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 6 to 1 outputs
+ VMOVDQU (R10), Y11
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y6, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 7 to 1 outputs
+ VMOVDQU (R11), Y11
+ ADDQ $0x20, R11
+ VGF2P8AFFINEQB $0x00, Y7, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 8 to 1 outputs
+ VMOVDQU (R12), Y11
+ ADDQ $0x20, R12
+ VGF2P8AFFINEQB $0x00, Y8, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 9 to 1 outputs
+ VMOVDQU (CX), Y11
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y9, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Store 1 outputs
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_10x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x1Xor_end:
+ RET
+
+// func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x1_64Xor(SB), $0-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 46 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x1_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ (R14), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R14
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+ MOVQ $0x0000000f, R15
+ MOVQ R15, X2
+ VPBROADCASTB X2, Y2
+
+mulAvxTwo_10x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R14), Y0
+ VMOVDQU 32(R14), Y1
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU 32(BX), Y5
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU 32(SI), Y5
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU 32(DI), Y5
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU (R8), Y6
+ VMOVDQU 32(R8), Y5
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU (R9), Y6
+ VMOVDQU 32(R9), Y5
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 256(CX), Y3
+ VMOVDQU 288(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU (R10), Y6
+ VMOVDQU 32(R10), Y5
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 320(CX), Y3
+ VMOVDQU 352(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU (R11), Y6
+ VMOVDQU 32(R11), Y5
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 384(CX), Y3
+ VMOVDQU 416(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU (R12), Y6
+ VMOVDQU 32(R12), Y5
+ ADDQ $0x40, R12
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 448(CX), Y3
+ VMOVDQU 480(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 8 to 1 outputs
+ VMOVDQU (R13), Y6
+ VMOVDQU 32(R13), Y5
+ ADDQ $0x40, R13
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 512(CX), Y3
+ VMOVDQU 544(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Load and process 64 bytes from input 9 to 1 outputs
+ VMOVDQU (DX), Y6
+ VMOVDQU 32(DX), Y5
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y6, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPAND Y2, Y6, Y6
+ VPAND Y2, Y5, Y5
+ VPAND Y2, Y7, Y7
+ VPAND Y2, Y8, Y8
+ VMOVDQU 576(CX), Y3
+ VMOVDQU 608(CX), Y4
+ VPSHUFB Y5, Y3, Y5
+ VPSHUFB Y6, Y3, Y3
+ VPSHUFB Y8, Y4, Y6
+ VPSHUFB Y7, Y4, Y4
+ XOR3WAY( $0x00, Y3, Y4, Y0)
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+
+ // Store 1 outputs
+ VMOVDQU Y0, (R14)
+ VMOVDQU Y1, 32(R14)
+ ADDQ $0x40, R14
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_10x1_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_10x1_64Xor_end:
+ RET
+
+// func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x2_64(SB), $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 89 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x2_64_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ (R14), R15
+ MOVQ 24(R14), R14
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R15
+ ADDQ BP, R14
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_10x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VPXOR Y7, Y8, Y3
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y9
+ VMOVDQU 32(SI), Y11
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU 32(DI), Y11
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y9
+ VMOVDQU 32(R8), Y11
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y9
+ VMOVDQU 32(R9), Y11
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y9
+ VMOVDQU 32(R10), Y11
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y9
+ VMOVDQU 32(R11), Y11
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU (R12), Y9
+ VMOVDQU 32(R12), Y11
+ ADDQ $0x40, R12
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 8 to 2 outputs
+ VMOVDQU (R13), Y9
+ VMOVDQU 32(R13), Y11
+ ADDQ $0x40, R13
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 1024(CX), Y5
+ VMOVDQU 1056(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1088(CX), Y5
+ VMOVDQU 1120(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 9 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 1152(CX), Y5
+ VMOVDQU 1184(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1216(CX), Y5
+ VMOVDQU 1248(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (R15)
+ VMOVDQU Y1, 32(R15)
+ ADDQ $0x40, R15
+ VMOVDQU Y2, (R14)
+ VMOVDQU Y3, 32(R14)
+ ADDQ $0x40, R14
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_10x2_64_loop
+ VZEROUPPER
+
+mulAvxTwo_10x2_64_end:
+ RET
+
+// func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 24 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), R12
+ MOVQ 216(CX), CX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R13
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R14
+ ADDQ R15, R13
+
+ // Add start offset to input
+ ADDQ R15, DX
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, CX
+
+mulGFNI_10x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z22
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z22, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z22, Z21
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z22
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z3, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z22
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z5, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z22
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z7, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z22
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z9, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (R9), Z22
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z10, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU64 (R10), Z22
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z12, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z13, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU64 (R11), Z22
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z14, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z15, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 8 to 2 outputs
+ VMOVDQU64 (R12), Z22
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB $0x00, Z16, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z17, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 9 to 2 outputs
+ VMOVDQU64 (CX), Z22
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z18, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z19, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Store 2 outputs
+ VMOVDQU64 Z20, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z21, (R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_10x2_64_loop
+ VZEROUPPER
+
+mulGFNI_10x2_64_end:
+ RET
+
+// func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x2(SB), $8-88
+ // Loading 12 of 20 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 24 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ (R14), R15
+ MOVQ 24(R14), R14
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R15
+ ADDQ BP, R14
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, DX
+
+mulAvxGFNI_10x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 2 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 2 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R14)
+ ADDQ $0x20, R14
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_10x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x2_end:
+ RET
+
+// func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 24 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), R12
+ MOVQ 216(CX), CX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R13
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R14
+ ADDQ R15, R13
+
+ // Add start offset to input
+ ADDQ R15, DX
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, CX
+
+mulGFNI_10x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (R14), Z20
+ VMOVDQU64 (R13), Z21
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z22
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z22
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z3, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z22
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z5, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z22
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z7, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z22
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z9, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (R9), Z22
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z10, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU64 (R10), Z22
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z12, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z13, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU64 (R11), Z22
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z14, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z15, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 8 to 2 outputs
+ VMOVDQU64 (R12), Z22
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB $0x00, Z16, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z17, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 9 to 2 outputs
+ VMOVDQU64 (CX), Z22
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z18, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z19, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Store 2 outputs
+ VMOVDQU64 Z20, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z21, (R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_10x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x2Xor(SB), $8-88
+ // Loading 12 of 20 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 24 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ (R14), R15
+ MOVQ 24(R14), R14
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R15
+ ADDQ BP, R14
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, DX
+
+mulAvxGFNI_10x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R15), Y12
+ VMOVDQU (R14), Y13
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 2 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 2 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R14)
+ ADDQ $0x20, R14
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_10x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x2Xor_end:
+ RET
+
+// func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x2_64Xor(SB), $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 89 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x2_64Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ (R14), R15
+ MOVQ 24(R14), R14
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R15
+ ADDQ BP, R14
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_10x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R15), Y0
+ VMOVDQU 32(R15), Y1
+ VMOVDQU (R14), Y2
+ VMOVDQU 32(R14), Y3
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y9
+ VMOVDQU 32(BX), Y11
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y9
+ VMOVDQU 32(SI), Y11
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU 32(DI), Y11
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y9
+ VMOVDQU 32(R8), Y11
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y9
+ VMOVDQU 32(R9), Y11
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y9
+ VMOVDQU 32(R10), Y11
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y9
+ VMOVDQU 32(R11), Y11
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU (R12), Y9
+ VMOVDQU 32(R12), Y11
+ ADDQ $0x40, R12
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 8 to 2 outputs
+ VMOVDQU (R13), Y9
+ VMOVDQU 32(R13), Y11
+ ADDQ $0x40, R13
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 1024(CX), Y5
+ VMOVDQU 1056(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1088(CX), Y5
+ VMOVDQU 1120(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Load and process 64 bytes from input 9 to 2 outputs
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y11
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y4, Y9, Y9
+ VPAND Y4, Y11, Y11
+ VPAND Y4, Y10, Y10
+ VPAND Y4, Y12, Y12
+ VMOVDQU 1152(CX), Y5
+ VMOVDQU 1184(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1216(CX), Y5
+ VMOVDQU 1248(CX), Y6
+ VPSHUFB Y11, Y5, Y7
+ VPSHUFB Y9, Y5, Y5
+ VPSHUFB Y12, Y6, Y8
+ VPSHUFB Y10, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // Store 2 outputs
+ VMOVDQU Y0, (R15)
+ VMOVDQU Y1, 32(R15)
+ ADDQ $0x40, R15
+ VMOVDQU Y2, (R14)
+ VMOVDQU Y3, 32(R14)
+ ADDQ $0x40, R14
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxTwo_10x2_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_10x2_64Xor_end:
+ RET
+
+// func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x3_64(SB), $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 130 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x3_64_end
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), R12
+ MOVQ 216(AX), AX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R15
+ MOVQ 48(R13), R13
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R13
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, AX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X6
+ VPBROADCASTB X6, Y6
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulAvxTwo_10x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VPXOR Y9, Y10, Y5
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU (SI), Y11
+ VMOVDQU 32(SI), Y13
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU (DI), Y11
+ VMOVDQU 32(DI), Y13
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU (R8), Y11
+ VMOVDQU 32(R8), Y13
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU (R9), Y11
+ VMOVDQU 32(R9), Y13
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU (R10), Y11
+ VMOVDQU 32(R10), Y13
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU (R11), Y11
+ VMOVDQU 32(R11), Y13
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 8 to 3 outputs
+ VMOVDQU (R12), Y11
+ VMOVDQU 32(R12), Y13
+ ADDQ $0x40, R12
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1536(CX), Y7
+ VMOVDQU 1568(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1600(CX), Y7
+ VMOVDQU 1632(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1664(CX), Y7
+ VMOVDQU 1696(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 9 to 3 outputs
+ VMOVDQU (AX), Y11
+ VMOVDQU 32(AX), Y13
+ ADDQ $0x40, AX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1728(CX), Y7
+ VMOVDQU 1760(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1792(CX), Y7
+ VMOVDQU 1824(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1856(CX), Y7
+ VMOVDQU 1888(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (R14)
+ VMOVDQU Y1, 32(R14)
+ ADDQ $0x40, R14
+ VMOVDQU Y2, (R15)
+ VMOVDQU Y3, 32(R15)
+ ADDQ $0x40, R15
+ VMOVDQU Y4, (R13)
+ VMOVDQU Y5, 32(R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxTwo_10x3_64_loop
+ VZEROUPPER
+
+mulAvxTwo_10x3_64_end:
+ RET
+
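+// mulGFNI_10x3_64 uses the GFNI affine instruction instead of table lookups:
+// each 8-byte matrix from the coefficient table is broadcast into a ZMM
+// register with VBROADCASTF32X2 (27 of the 30 fit, per the comment below),
+// and VGF2P8AFFINEQB multiplies a whole 64-byte block by that constant in a
+// single instruction. The three matrices that do not fit are applied with the
+// .BCST embedded-broadcast form directly from memory, and the partial
+// products are combined with VXORPD.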
+// func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x3_64(SB), $8-88
+ // Loading 27 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 35 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ VBROADCASTF32X2 208(CX), Z26
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), R12
+ MOVQ 216(AX), AX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R15
+ MOVQ 48(R13), R13
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R13
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_10x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 3 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z26, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 3 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 3 outputs
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_10x3_64_loop
+ VZEROUPPER
+
+mulGFNI_10x3_64_end:
+ RET
+
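+// mulAvxGFNI_10x3 is the 256-bit GFNI variant (Requires lists only AVX and
+// GFNI, no AVX-512). It processes 32 bytes per shard per iteration, keeps
+// just 11 of the 30 coefficient matrices in YMM registers, and re-broadcasts
+// the remaining ones with VBROADCASTSD inside the loop.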
+// func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x3(SB), $8-88
+ // Loading 11 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 35 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), R12
+ MOVQ 216(AX), AX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R15
+ MOVQ 48(R13), R13
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R13
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_10x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 3 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 3 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 3 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 3 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R13)
+ ADDQ $0x20, R13
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_10x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x3_end:
+ RET
+
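+// The Xor variants differ from the plain kernels only in how the accumulators
+// start: instead of overwriting the destination, each loop iteration first
+// loads the current output blocks and XORs the new products into them, so the
+// routine can add its contribution on top of previously computed parity.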
+// func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x3_64Xor(SB), $8-88
+ // Loading 27 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 35 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ VBROADCASTF32X2 208(CX), Z26
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), R12
+ MOVQ 216(AX), AX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R15
+ MOVQ 48(R13), R13
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R13
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_10x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R13), Z29
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 3 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z26, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 3 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 3 outputs
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_10x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x3_64Xor_end:
+ RET
+
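+// mulAvxGFNI_10x3Xor is the 256-bit counterpart of mulGFNI_10x3_64Xor above:
+// the same read-modify-write accumulation, but 32-byte blocks in YMM registers.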
+// func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x3Xor(SB), $8-88
+ // Loading 11 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 35 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), R12
+ MOVQ 216(AX), AX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R15
+ MOVQ 48(R13), R13
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R13
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_10x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R13), Y13
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 3 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 3 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 3 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 3 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R13)
+ ADDQ $0x20, R13
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_10x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x3Xor_end:
+ RET
+
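+// mulAvxTwo_10x3_64Xor reuses the nibble-table kernel of mulAvxTwo_10x3_64
+// but seeds Y0-Y5 from the existing output bytes at the top of each loop
+// iteration, so every input (including the first) accumulates via XOR3WAY.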
+// func mulAvxTwo_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x3_64Xor(SB), $8-88
+ // Loading no tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 130 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x3_64Xor_end
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), R12
+ MOVQ 216(AX), AX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R15
+ MOVQ 48(R13), R13
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R13
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, AX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X6
+ VPBROADCASTB X6, Y6
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulAvxTwo_10x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R14), Y0
+ VMOVDQU 32(R14), Y1
+ VMOVDQU (R15), Y2
+ VMOVDQU 32(R15), Y3
+ VMOVDQU (R13), Y4
+ VMOVDQU 32(R13), Y5
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y13
+ ADDQ $0x40, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU (BX), Y11
+ VMOVDQU 32(BX), Y13
+ ADDQ $0x40, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU (SI), Y11
+ VMOVDQU 32(SI), Y13
+ ADDQ $0x40, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU (DI), Y11
+ VMOVDQU 32(DI), Y13
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU (R8), Y11
+ VMOVDQU 32(R8), Y13
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU (R9), Y11
+ VMOVDQU 32(R9), Y13
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU (R10), Y11
+ VMOVDQU 32(R10), Y13
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU (R11), Y11
+ VMOVDQU 32(R11), Y13
+ ADDQ $0x40, R11
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 8 to 3 outputs
+ VMOVDQU (R12), Y11
+ VMOVDQU 32(R12), Y13
+ ADDQ $0x40, R12
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1536(CX), Y7
+ VMOVDQU 1568(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1600(CX), Y7
+ VMOVDQU 1632(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1664(CX), Y7
+ VMOVDQU 1696(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Load and process 64 bytes from input 9 to 3 outputs
+ VMOVDQU (AX), Y11
+ VMOVDQU 32(AX), Y13
+ ADDQ $0x40, AX
+ VPSRLQ $0x04, Y11, Y12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y6, Y11, Y11
+ VPAND Y6, Y13, Y13
+ VPAND Y6, Y12, Y12
+ VPAND Y6, Y14, Y14
+ VMOVDQU 1728(CX), Y7
+ VMOVDQU 1760(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1792(CX), Y7
+ VMOVDQU 1824(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1856(CX), Y7
+ VMOVDQU 1888(CX), Y8
+ VPSHUFB Y13, Y7, Y9
+ VPSHUFB Y11, Y7, Y7
+ VPSHUFB Y14, Y8, Y10
+ VPSHUFB Y12, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // Store 3 outputs
+ VMOVDQU Y0, (R14)
+ VMOVDQU Y1, 32(R14)
+ ADDQ $0x40, R14
+ VMOVDQU Y2, (R15)
+ VMOVDQU Y3, 32(R15)
+ ADDQ $0x40, R15
+ VMOVDQU Y4, (R13)
+ VMOVDQU Y5, 32(R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxTwo_10x3_64Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_10x3_64Xor_end:
+ RET
+
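+// With 4 outputs the destination pointers no longer all stay in
+// general-purpose registers, so mulAvxTwo_10x4 keeps the destinations on the
+// stack: each iteration re-reads every output's base pointer from the out
+// slice (MOVQ (R14), BP) and stores through (BP)(R15*1), advancing the shared
+// offset R15 by 32 bytes per pass.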
+// func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x4(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 89 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x4_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_10x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y0
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y1
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y2
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ VPXOR Y5, Y6, Y3
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y7
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y7
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1024(CX), Y5
+ VMOVDQU 1056(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1088(CX), Y5
+ VMOVDQU 1120(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1152(CX), Y5
+ VMOVDQU 1184(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1216(CX), Y5
+ VMOVDQU 1248(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y7
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1280(CX), Y5
+ VMOVDQU 1312(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1344(CX), Y5
+ VMOVDQU 1376(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1408(CX), Y5
+ VMOVDQU 1440(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1472(CX), Y5
+ VMOVDQU 1504(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R11), Y7
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1536(CX), Y5
+ VMOVDQU 1568(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1600(CX), Y5
+ VMOVDQU 1632(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1664(CX), Y5
+ VMOVDQU 1696(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1728(CX), Y5
+ VMOVDQU 1760(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (R12), Y7
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1792(CX), Y5
+ VMOVDQU 1824(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1856(CX), Y5
+ VMOVDQU 1888(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1920(CX), Y5
+ VMOVDQU 1952(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1984(CX), Y5
+ VMOVDQU 2016(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 8 to 4 outputs
+ VMOVDQU (R13), Y7
+ ADDQ $0x20, R13
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 2048(CX), Y5
+ VMOVDQU 2080(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 2112(CX), Y5
+ VMOVDQU 2144(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 2176(CX), Y5
+ VMOVDQU 2208(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 2240(CX), Y5
+ VMOVDQU 2272(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 9 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 2304(CX), Y5
+ VMOVDQU 2336(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 2368(CX), Y5
+ VMOVDQU 2400(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 2432(CX), Y5
+ VMOVDQU 2464(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 2496(CX), Y5
+ VMOVDQU 2528(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y0, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y1, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y2, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y3, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxTwo_10x4_loop
+ VZEROUPPER
+
+mulAvxTwo_10x4_end:
+ RET
+
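+// mulGFNI_10x4_64 follows the same pattern with 4 outputs: 26 of the 40
+// 8-byte matrices stay in ZMM registers, the remainder use the .BCST
+// embedded-broadcast form, and the destination pointers are kept on the
+// stack as in the 10x4 AVX2 kernel.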
+// func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x4_64(SB), $8-88
+ // Loading 26 of 40 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 46 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 4 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 4 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 4 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 4 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 4 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x4_64_loop
+ VZEROUPPER
+
+mulGFNI_10x4_64_end:
+ RET
+
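+// mulAvxGFNI_10x4 is the 256-bit GFNI version with 4 outputs; only 10 of the
+// 40 matrices fit in YMM registers, so most are broadcast inside the loop.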
+// func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x4(SB), $8-88
+ // Loading 10 of 40 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 46 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 4 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x4_end:
+ RET
+
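+// mulGFNI_10x4_64Xor combines the stack-kept destinations of the 10x4
+// kernels with the read-modify-write accumulation of the Xor variants: the
+// four output blocks are loaded through the out slice at the top of each
+// loop iteration before the new products are XORed in.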
+// func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x4_64Xor(SB), $8-88
+ // Loading 26 of 40 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 46 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x4_64Xor_loop:
+ // Load 4 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 (BP)(R15*1), Z26
+ MOVQ 24(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z27
+ MOVQ 48(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z28
+ MOVQ 72(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z29
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 4 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 4 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 4 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 4 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 4 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x4_64Xor_end:
+ RET
+
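The _Xor suffix on mulGFNI_10x4_64Xor (and the other *Xor kernels below) means the routine accumulates into the existing contents of the output shards instead of overwriting them, which is why its loop begins with a "Load 4 outputs" block that the plain kernels do not have. A hedged pure-Go sketch of the contract both shapes implement; mulReference and galMul are hypothetical names, and the flat coefficient slice is a simplification of the uint64 bit-matrix encoding the assembly actually consumes:

func mulReference(coeffs []byte, in, out [][]byte, xor bool) {
	// coeffs holds one GF(2^8) coefficient per (input, output) pair,
	// stored input-major. galMul is assumed to be a scalar GF(2^8) multiply.
	for o := range out {
		for i := range in {
			c := coeffs[i*len(out)+o]
			for n, b := range in[i] {
				if i == 0 && !xor {
					out[o][n] = galMul(c, b) // plain kernels: first input overwrites
				} else {
					out[o][n] ^= galMul(c, b) // _Xor kernels, and inputs 1..9: accumulate
				}
			}
		}
	}
}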
+// func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x4Xor(SB), $8-88
+ // Loading 10 of 40 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 46 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x4Xor_loop:
+ // Load 4 outputs
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y10
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y11
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y12
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y13
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 4 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x4Xor_end:
+ RET
+
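The constant displacements in the table loads above fall straight out of the coefficient layout: one 8-byte bit matrix per (input, output) pair, stored input-major, so input 6 / output 2 of a 4-output kernel sits at 8*(6*4+2) = 208(CX) and input 9 / output 3 at 8*(9*4+3) = 312(CX), matching the operands in the 10x4 kernels. A small sketch of that offset arithmetic; gfniTableOffset is a hypothetical helper:

func gfniTableOffset(input, output, numOutputs int) int {
	// Byte displacement from the matrix base pointer (CX) of the uint64
	// bit matrix for this (input, output) pair, input-major layout.
	return 8 * (input*numOutputs + output)
}

// gfniTableOffset(6, 2, 4) == 208 and gfniTableOffset(9, 3, 4) == 312.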
+// func mulAvxTwo_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x4Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 89 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x4Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X4
+ VPBROADCASTB X4, Y4
+
+mulAvxTwo_10x4Xor_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y0
+ VMOVDQU (CX), Y5
+ VMOVDQU 32(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y1
+ VMOVDQU 64(CX), Y5
+ VMOVDQU 96(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y2
+ VMOVDQU 128(CX), Y5
+ VMOVDQU 160(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y3
+ VMOVDQU 192(CX), Y5
+ VMOVDQU 224(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 256(CX), Y5
+ VMOVDQU 288(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 320(CX), Y5
+ VMOVDQU 352(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 384(CX), Y5
+ VMOVDQU 416(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 448(CX), Y5
+ VMOVDQU 480(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 512(CX), Y5
+ VMOVDQU 544(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 576(CX), Y5
+ VMOVDQU 608(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 640(CX), Y5
+ VMOVDQU 672(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 704(CX), Y5
+ VMOVDQU 736(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y7
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 768(CX), Y5
+ VMOVDQU 800(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 832(CX), Y5
+ VMOVDQU 864(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 896(CX), Y5
+ VMOVDQU 928(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 960(CX), Y5
+ VMOVDQU 992(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y7
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1024(CX), Y5
+ VMOVDQU 1056(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1088(CX), Y5
+ VMOVDQU 1120(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1152(CX), Y5
+ VMOVDQU 1184(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1216(CX), Y5
+ VMOVDQU 1248(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y7
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1280(CX), Y5
+ VMOVDQU 1312(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1344(CX), Y5
+ VMOVDQU 1376(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1408(CX), Y5
+ VMOVDQU 1440(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1472(CX), Y5
+ VMOVDQU 1504(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R11), Y7
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1536(CX), Y5
+ VMOVDQU 1568(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1600(CX), Y5
+ VMOVDQU 1632(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1664(CX), Y5
+ VMOVDQU 1696(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1728(CX), Y5
+ VMOVDQU 1760(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (R12), Y7
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 1792(CX), Y5
+ VMOVDQU 1824(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 1856(CX), Y5
+ VMOVDQU 1888(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 1920(CX), Y5
+ VMOVDQU 1952(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 1984(CX), Y5
+ VMOVDQU 2016(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 8 to 4 outputs
+ VMOVDQU (R13), Y7
+ ADDQ $0x20, R13
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 2048(CX), Y5
+ VMOVDQU 2080(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 2112(CX), Y5
+ VMOVDQU 2144(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 2176(CX), Y5
+ VMOVDQU 2208(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 2240(CX), Y5
+ VMOVDQU 2272(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Load and process 32 bytes from input 9 to 4 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y7, Y8
+ VPAND Y4, Y7, Y7
+ VPAND Y4, Y8, Y8
+ VMOVDQU 2304(CX), Y5
+ VMOVDQU 2336(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y0)
+ VMOVDQU 2368(CX), Y5
+ VMOVDQU 2400(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y1)
+ VMOVDQU 2432(CX), Y5
+ VMOVDQU 2464(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU 2496(CX), Y5
+ VMOVDQU 2528(CX), Y6
+ VPSHUFB Y7, Y5, Y5
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y3)
+
+ // Store 4 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y0, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y1, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y2, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y3, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxTwo_10x4Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_10x4Xor_end:
+ RET
+
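mulAvxTwo_10x4Xor above is the pre-GFNI path: each input byte is split into its low and high nibble (VPSRLQ $0x04 plus VPAND with the broadcast 0x0f mask), each nibble indexes a 16-entry VPSHUFB lookup table holding that nibble's product with the coefficient, and XOR3WAY folds both partial products into the output register. A hedged scalar sketch of the same lookup; buildNibbleTables, mulByteXor and galMul are hypothetical names:

// buildNibbleTables returns the two 16-entry tables the 32-byte pairs at
// (CX), 32(CX), 64(CX), ... are built from: low[n] = c*n and
// high[n] = c*(n<<4) in GF(2^8).
func buildNibbleTables(c byte) (low, high [16]byte) {
	for n := 0; n < 16; n++ {
		low[n] = galMul(c, byte(n))
		high[n] = galMul(c, byte(n)<<4)
	}
	return
}

// mulByteXor performs one XOR3WAY step for a single byte: since
// c*b = c*(b&0x0f) ^ c*(b&0xf0), two table lookups and two XORs suffice.
func mulByteXor(low, high *[16]byte, b, acc byte) byte {
	return acc ^ low[b&0x0f] ^ high[b>>4]
}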
+// func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x5(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 110 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x5_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_10x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y0
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y1
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y2
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y3
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ VPXOR Y6, Y7, Y4
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 640(CX), Y6
+ VMOVDQU 672(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 704(CX), Y6
+ VMOVDQU 736(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 768(CX), Y6
+ VMOVDQU 800(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 832(CX), Y6
+ VMOVDQU 864(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 896(CX), Y6
+ VMOVDQU 928(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y8
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 960(CX), Y6
+ VMOVDQU 992(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1024(CX), Y6
+ VMOVDQU 1056(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1088(CX), Y6
+ VMOVDQU 1120(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1152(CX), Y6
+ VMOVDQU 1184(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1216(CX), Y6
+ VMOVDQU 1248(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y8
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1280(CX), Y6
+ VMOVDQU 1312(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1344(CX), Y6
+ VMOVDQU 1376(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1408(CX), Y6
+ VMOVDQU 1440(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1472(CX), Y6
+ VMOVDQU 1504(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1536(CX), Y6
+ VMOVDQU 1568(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y8
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1600(CX), Y6
+ VMOVDQU 1632(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1664(CX), Y6
+ VMOVDQU 1696(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1728(CX), Y6
+ VMOVDQU 1760(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1792(CX), Y6
+ VMOVDQU 1824(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1856(CX), Y6
+ VMOVDQU 1888(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R11), Y8
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1920(CX), Y6
+ VMOVDQU 1952(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1984(CX), Y6
+ VMOVDQU 2016(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2048(CX), Y6
+ VMOVDQU 2080(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2112(CX), Y6
+ VMOVDQU 2144(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2176(CX), Y6
+ VMOVDQU 2208(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (R12), Y8
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 2240(CX), Y6
+ VMOVDQU 2272(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 2304(CX), Y6
+ VMOVDQU 2336(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2368(CX), Y6
+ VMOVDQU 2400(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2432(CX), Y6
+ VMOVDQU 2464(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2496(CX), Y6
+ VMOVDQU 2528(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 8 to 5 outputs
+ VMOVDQU (R13), Y8
+ ADDQ $0x20, R13
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 2560(CX), Y6
+ VMOVDQU 2592(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 2624(CX), Y6
+ VMOVDQU 2656(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2688(CX), Y6
+ VMOVDQU 2720(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2752(CX), Y6
+ VMOVDQU 2784(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2816(CX), Y6
+ VMOVDQU 2848(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 9 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 2880(CX), Y6
+ VMOVDQU 2912(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 2944(CX), Y6
+ VMOVDQU 2976(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 3008(CX), Y6
+ VMOVDQU 3040(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 3072(CX), Y6
+ VMOVDQU 3104(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 3136(CX), Y6
+ VMOVDQU 3168(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y0, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y1, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y2, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y3, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y4, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxTwo_10x5_loop
+ VZEROUPPER
+
+mulAvxTwo_10x5_end:
+ RET
+
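Each kernel converts the byte count n into a whole-block loop count up front: SHRQ $0x05 (n/32) for the YMM kernels such as mulAvxTwo_10x5 above, SHRQ $0x06 (n/64) for the _64 ZMM kernels; the start offset is added to every input pointer and reused as the index into the output shards (R15), and any tail shorter than one block is left to the caller. A small sketch of that bookkeeping; blocksAndTail is a hypothetical helper:

func blocksAndTail(n, blockSize int) (loops, tail int) {
	// blockSize is 32 for the AVX/AVX2 kernels and 64 for the _64 kernels.
	loops = n / blockSize
	tail = n - loops*blockSize // bytes not touched by the assembly loop
	return
}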
+// func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x5_64(SB), $8-88
+ // Loading 25 of 50 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 57 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 5 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 5 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 5 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x5_64_loop
+ VZEROUPPER
+
+mulGFNI_10x5_64_end:
+ RET
+
+// func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x5(SB), $8-88
+ // Loading 9 of 50 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 57 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 5 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x5_end:
+ RET
+
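"Loading 9 of 50 tables to registers" in the mulAvxGFNI_10x5 header is a register budget: of the 16 YMM registers, one holds the loaded input block, one the affine product, and one each of the 5 output accumulators, leaving 9 for resident coefficient matrices (Y0-Y8); the remaining 41 are re-broadcast with VBROADCASTSD inside the loop. The _64 kernels have 32 ZMM registers and use the VGF2P8AFFINEQB.BCST memory-broadcast form for the overflow, hence "Loading 25 of 50". A sketch of that arithmetic; residentTables is a hypothetical helper:

func residentTables(vectorRegs, outputs, totalTables int) int {
	// Reserve one register per output accumulator, one for the input block
	// and one for the affine product; the rest can hold coefficient matrices.
	free := vectorRegs - outputs - 2
	if free > totalTables {
		return totalTables
	}
	return free
}

// residentTables(16, 5, 50) == 9, residentTables(32, 5, 50) == 25,
// residentTables(32, 4, 40) == 26 - matching the kernel headers.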
+// func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x5_64Xor(SB), $8-88
+ // Loading 25 of 50 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 57 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x5_64Xor_loop:
+ // Load 5 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 (BP)(R15*1), Z25
+ MOVQ 24(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z26
+ MOVQ 48(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z27
+ MOVQ 72(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z28
+ MOVQ 96(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z29
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 5 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 5 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 5 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x5Xor(SB), $8-88
+ // Loading 9 of 50 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 57 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x5Xor_loop:
+ // Load 5 outputs
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y9
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y10
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y11
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y12
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 5 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x5Xor_end:
+ RET
+
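The "Requires:" annotations on these kernels (GFNI with AVX-512 for the _64 variants, GFNI with AVX for the YMM variants, AVX2 for the mulAvxTwo family) are what the package's Go-side dispatch has to honour; the selection itself lives outside this file. Purely as an illustration of the kind of check involved, a hedged sketch using github.com/klauspost/cpuid/v2, the dependency bumped in go.mod, and not the package's actual dispatch code:

package main

import (
	"fmt"

	"github.com/klauspost/cpuid/v2"
)

func main() {
	// Pick the widest kernel family the running CPU can execute.
	switch {
	case cpuid.CPU.Supports(cpuid.GFNI, cpuid.AVX512F, cpuid.AVX512DQ):
		fmt.Println("mulGFNI_*_64 (ZMM) kernels are usable")
	case cpuid.CPU.Supports(cpuid.GFNI, cpuid.AVX):
		fmt.Println("mulAvxGFNI_* (YMM) kernels are usable")
	case cpuid.CPU.Supports(cpuid.AVX2):
		fmt.Println("mulAvxTwo_* kernels are usable")
	default:
		fmt.Println("fall back to the generic Go implementation")
	}
}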
+// func mulAvxTwo_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x5Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 110 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x5Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X5
+ VPBROADCASTB X5, Y5
+
+mulAvxTwo_10x5Xor_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y0
+ VMOVDQU (CX), Y6
+ VMOVDQU 32(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y1
+ VMOVDQU 64(CX), Y6
+ VMOVDQU 96(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y2
+ VMOVDQU 128(CX), Y6
+ VMOVDQU 160(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y3
+ VMOVDQU 192(CX), Y6
+ VMOVDQU 224(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y4
+ VMOVDQU 256(CX), Y6
+ VMOVDQU 288(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 320(CX), Y6
+ VMOVDQU 352(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 384(CX), Y6
+ VMOVDQU 416(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 448(CX), Y6
+ VMOVDQU 480(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 512(CX), Y6
+ VMOVDQU 544(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 576(CX), Y6
+ VMOVDQU 608(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 640(CX), Y6
+ VMOVDQU 672(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 704(CX), Y6
+ VMOVDQU 736(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 768(CX), Y6
+ VMOVDQU 800(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 832(CX), Y6
+ VMOVDQU 864(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 896(CX), Y6
+ VMOVDQU 928(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y8
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 960(CX), Y6
+ VMOVDQU 992(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1024(CX), Y6
+ VMOVDQU 1056(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1088(CX), Y6
+ VMOVDQU 1120(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1152(CX), Y6
+ VMOVDQU 1184(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1216(CX), Y6
+ VMOVDQU 1248(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y8
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1280(CX), Y6
+ VMOVDQU 1312(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1344(CX), Y6
+ VMOVDQU 1376(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1408(CX), Y6
+ VMOVDQU 1440(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1472(CX), Y6
+ VMOVDQU 1504(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1536(CX), Y6
+ VMOVDQU 1568(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y8
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1600(CX), Y6
+ VMOVDQU 1632(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1664(CX), Y6
+ VMOVDQU 1696(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 1728(CX), Y6
+ VMOVDQU 1760(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 1792(CX), Y6
+ VMOVDQU 1824(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 1856(CX), Y6
+ VMOVDQU 1888(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R11), Y8
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 1920(CX), Y6
+ VMOVDQU 1952(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 1984(CX), Y6
+ VMOVDQU 2016(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2048(CX), Y6
+ VMOVDQU 2080(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2112(CX), Y6
+ VMOVDQU 2144(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2176(CX), Y6
+ VMOVDQU 2208(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (R12), Y8
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 2240(CX), Y6
+ VMOVDQU 2272(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 2304(CX), Y6
+ VMOVDQU 2336(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2368(CX), Y6
+ VMOVDQU 2400(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2432(CX), Y6
+ VMOVDQU 2464(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2496(CX), Y6
+ VMOVDQU 2528(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 8 to 5 outputs
+ VMOVDQU (R13), Y8
+ ADDQ $0x20, R13
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 2560(CX), Y6
+ VMOVDQU 2592(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 2624(CX), Y6
+ VMOVDQU 2656(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 2688(CX), Y6
+ VMOVDQU 2720(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 2752(CX), Y6
+ VMOVDQU 2784(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 2816(CX), Y6
+ VMOVDQU 2848(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Load and process 32 bytes from input 9 to 5 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y8, Y9
+ VPAND Y5, Y8, Y8
+ VPAND Y5, Y9, Y9
+ VMOVDQU 2880(CX), Y6
+ VMOVDQU 2912(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y0)
+ VMOVDQU 2944(CX), Y6
+ VMOVDQU 2976(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y1)
+ VMOVDQU 3008(CX), Y6
+ VMOVDQU 3040(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y2)
+ VMOVDQU 3072(CX), Y6
+ VMOVDQU 3104(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y3)
+ VMOVDQU 3136(CX), Y6
+ VMOVDQU 3168(CX), Y7
+ VPSHUFB Y8, Y6, Y6
+ VPSHUFB Y9, Y7, Y7
+ XOR3WAY( $0x00, Y6, Y7, Y4)
+
+ // Store 5 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y0, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y1, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y2, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y3, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y4, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxTwo_10x5Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_10x5Xor_end:
+ RET
+
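+// The mulAvxTwo_* kernels below (and the *Xor kernel above) multiply in
+// GF(2^8) with the split-nibble shuffle method: each 32-byte input block is
+// split into its low and high 4-bit halves (VPSRLQ/VPAND against the
+// broadcast 0x0f mask), each half indexes a 16-entry lookup table loaded
+// from the matrix with VMOVDQU, and the two partial products are folded
+// into the running outputs with VPXOR / XOR3WAY.
+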
+// func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x6(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 131 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x6_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_10x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y0
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y1
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y2
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y3
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y4
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ VPXOR Y7, Y8, Y5
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y9
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y9
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1536(CX), Y7
+ VMOVDQU 1568(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1600(CX), Y7
+ VMOVDQU 1632(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1664(CX), Y7
+ VMOVDQU 1696(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1728(CX), Y7
+ VMOVDQU 1760(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1792(CX), Y7
+ VMOVDQU 1824(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1856(CX), Y7
+ VMOVDQU 1888(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y9
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1920(CX), Y7
+ VMOVDQU 1952(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1984(CX), Y7
+ VMOVDQU 2016(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2048(CX), Y7
+ VMOVDQU 2080(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2112(CX), Y7
+ VMOVDQU 2144(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2176(CX), Y7
+ VMOVDQU 2208(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2240(CX), Y7
+ VMOVDQU 2272(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y9
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 2304(CX), Y7
+ VMOVDQU 2336(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 2368(CX), Y7
+ VMOVDQU 2400(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2432(CX), Y7
+ VMOVDQU 2464(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2496(CX), Y7
+ VMOVDQU 2528(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2560(CX), Y7
+ VMOVDQU 2592(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2624(CX), Y7
+ VMOVDQU 2656(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (R12), Y9
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 2688(CX), Y7
+ VMOVDQU 2720(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 2752(CX), Y7
+ VMOVDQU 2784(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2816(CX), Y7
+ VMOVDQU 2848(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2880(CX), Y7
+ VMOVDQU 2912(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2944(CX), Y7
+ VMOVDQU 2976(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 3008(CX), Y7
+ VMOVDQU 3040(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 8 to 6 outputs
+ VMOVDQU (R13), Y9
+ ADDQ $0x20, R13
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 3072(CX), Y7
+ VMOVDQU 3104(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 3136(CX), Y7
+ VMOVDQU 3168(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 3200(CX), Y7
+ VMOVDQU 3232(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 3264(CX), Y7
+ VMOVDQU 3296(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 3328(CX), Y7
+ VMOVDQU 3360(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 3392(CX), Y7
+ VMOVDQU 3424(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 9 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 3456(CX), Y7
+ VMOVDQU 3488(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 3520(CX), Y7
+ VMOVDQU 3552(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 3584(CX), Y7
+ VMOVDQU 3616(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 3648(CX), Y7
+ VMOVDQU 3680(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 3712(CX), Y7
+ VMOVDQU 3744(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 3776(CX), Y7
+ VMOVDQU 3808(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y0, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y1, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y2, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y3, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y4, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxTwo_10x6_loop
+ VZEROUPPER
+
+mulAvxTwo_10x6_end:
+ RET
+
+// func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x6_64(SB), $8-88
+ // Loading 24 of 60 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 68 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 6 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 6 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 6 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x6_64_loop
+ VZEROUPPER
+
+mulGFNI_10x6_64_end:
+ RET
+
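+// The mulGFNI_* and mulAvxGFNI_* kernels below replace the table lookups
+// with GF2P8AFFINEQB: each matrix coefficient is kept as an 8x8 bit matrix
+// (broadcast into a register with VBROADCASTF32X2 / VBROADCASTSD, or read
+// via the .BCST memory-broadcast form once registers run out) and applied
+// to 64 or 32 input bytes at a time, with VXORPD accumulating the partial
+// products into the outputs.
+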
+// func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x6(SB), $8-88
+ // Loading 8 of 60 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 68 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 6 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x6_end:
+ RET
+
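+// The *Xor variants below differ from the plain kernels only in that they
+// first load the current contents of each output slice ("Load 6 outputs")
+// and accumulate the new products into them, instead of overwriting the
+// destinations with freshly computed values.
+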
+// func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x6_64Xor(SB), $8-88
+ // Loading 24 of 60 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 68 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x6_64Xor_loop:
+ // Load 6 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 (BP)(R15*1), Z24
+ MOVQ 24(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z25
+ MOVQ 48(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z26
+ MOVQ 72(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z27
+ MOVQ 96(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z28
+ MOVQ 120(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z29
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 6 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 6 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 6 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x6Xor(SB), $8-88
+ // Loading 8 of 60 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 68 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x6Xor_loop:
+ // Load 6 outputs
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y8
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y9
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y10
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y11
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y12
+ MOVQ 120(R14), BP
+ VMOVDQU (BP)(R15*1), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 6 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x6Xor_end:
+ RET
+
+// func mulAvxTwo_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x6Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 131 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x6Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X6
+ VPBROADCASTB X6, Y6
+
+mulAvxTwo_10x6Xor_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y0
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y1
+ VMOVDQU 64(CX), Y7
+ VMOVDQU 96(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y2
+ VMOVDQU 128(CX), Y7
+ VMOVDQU 160(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y3
+ VMOVDQU 192(CX), Y7
+ VMOVDQU 224(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y4
+ VMOVDQU 256(CX), Y7
+ VMOVDQU 288(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ MOVQ 120(R14), BP
+ VMOVDQU (BP)(R15*1), Y5
+ VMOVDQU 320(CX), Y7
+ VMOVDQU 352(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 384(CX), Y7
+ VMOVDQU 416(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 448(CX), Y7
+ VMOVDQU 480(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 512(CX), Y7
+ VMOVDQU 544(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 576(CX), Y7
+ VMOVDQU 608(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 640(CX), Y7
+ VMOVDQU 672(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 704(CX), Y7
+ VMOVDQU 736(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 768(CX), Y7
+ VMOVDQU 800(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 832(CX), Y7
+ VMOVDQU 864(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 896(CX), Y7
+ VMOVDQU 928(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 960(CX), Y7
+ VMOVDQU 992(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1024(CX), Y7
+ VMOVDQU 1056(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1088(CX), Y7
+ VMOVDQU 1120(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y9
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1152(CX), Y7
+ VMOVDQU 1184(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1216(CX), Y7
+ VMOVDQU 1248(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1280(CX), Y7
+ VMOVDQU 1312(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1344(CX), Y7
+ VMOVDQU 1376(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1408(CX), Y7
+ VMOVDQU 1440(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1472(CX), Y7
+ VMOVDQU 1504(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y9
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1536(CX), Y7
+ VMOVDQU 1568(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1600(CX), Y7
+ VMOVDQU 1632(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 1664(CX), Y7
+ VMOVDQU 1696(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 1728(CX), Y7
+ VMOVDQU 1760(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 1792(CX), Y7
+ VMOVDQU 1824(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 1856(CX), Y7
+ VMOVDQU 1888(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y9
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 1920(CX), Y7
+ VMOVDQU 1952(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 1984(CX), Y7
+ VMOVDQU 2016(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2048(CX), Y7
+ VMOVDQU 2080(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2112(CX), Y7
+ VMOVDQU 2144(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2176(CX), Y7
+ VMOVDQU 2208(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2240(CX), Y7
+ VMOVDQU 2272(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y9
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 2304(CX), Y7
+ VMOVDQU 2336(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 2368(CX), Y7
+ VMOVDQU 2400(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2432(CX), Y7
+ VMOVDQU 2464(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2496(CX), Y7
+ VMOVDQU 2528(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2560(CX), Y7
+ VMOVDQU 2592(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 2624(CX), Y7
+ VMOVDQU 2656(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (R12), Y9
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 2688(CX), Y7
+ VMOVDQU 2720(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 2752(CX), Y7
+ VMOVDQU 2784(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 2816(CX), Y7
+ VMOVDQU 2848(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 2880(CX), Y7
+ VMOVDQU 2912(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 2944(CX), Y7
+ VMOVDQU 2976(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 3008(CX), Y7
+ VMOVDQU 3040(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 8 to 6 outputs
+ VMOVDQU (R13), Y9
+ ADDQ $0x20, R13
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 3072(CX), Y7
+ VMOVDQU 3104(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 3136(CX), Y7
+ VMOVDQU 3168(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 3200(CX), Y7
+ VMOVDQU 3232(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 3264(CX), Y7
+ VMOVDQU 3296(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 3328(CX), Y7
+ VMOVDQU 3360(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 3392(CX), Y7
+ VMOVDQU 3424(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Load and process 32 bytes from input 9 to 6 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y9, Y10
+ VPAND Y6, Y9, Y9
+ VPAND Y6, Y10, Y10
+ VMOVDQU 3456(CX), Y7
+ VMOVDQU 3488(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y0)
+ VMOVDQU 3520(CX), Y7
+ VMOVDQU 3552(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y1)
+ VMOVDQU 3584(CX), Y7
+ VMOVDQU 3616(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y2)
+ VMOVDQU 3648(CX), Y7
+ VMOVDQU 3680(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+ VMOVDQU 3712(CX), Y7
+ VMOVDQU 3744(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU 3776(CX), Y7
+ VMOVDQU 3808(CX), Y8
+ VPSHUFB Y9, Y7, Y7
+ VPSHUFB Y10, Y8, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+
+ // Store 6 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y0, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y1, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y2, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y3, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y4, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxTwo_10x6Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_10x6Xor_end:
+ RET
+
+// func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x7(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 152 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x7_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_10x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y0
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y1
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y2
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y3
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y4
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y5
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ VPXOR Y8, Y9, Y6
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 896(CX), Y8
+ VMOVDQU 928(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 960(CX), Y8
+ VMOVDQU 992(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1024(CX), Y8
+ VMOVDQU 1056(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1088(CX), Y8
+ VMOVDQU 1120(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1152(CX), Y8
+ VMOVDQU 1184(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1216(CX), Y8
+ VMOVDQU 1248(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1280(CX), Y8
+ VMOVDQU 1312(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y10
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1344(CX), Y8
+ VMOVDQU 1376(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1408(CX), Y8
+ VMOVDQU 1440(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1472(CX), Y8
+ VMOVDQU 1504(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1536(CX), Y8
+ VMOVDQU 1568(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1600(CX), Y8
+ VMOVDQU 1632(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1664(CX), Y8
+ VMOVDQU 1696(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1728(CX), Y8
+ VMOVDQU 1760(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y10
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1792(CX), Y8
+ VMOVDQU 1824(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1856(CX), Y8
+ VMOVDQU 1888(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1920(CX), Y8
+ VMOVDQU 1952(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1984(CX), Y8
+ VMOVDQU 2016(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2048(CX), Y8
+ VMOVDQU 2080(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2112(CX), Y8
+ VMOVDQU 2144(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2176(CX), Y8
+ VMOVDQU 2208(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y10
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2240(CX), Y8
+ VMOVDQU 2272(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2304(CX), Y8
+ VMOVDQU 2336(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2368(CX), Y8
+ VMOVDQU 2400(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2432(CX), Y8
+ VMOVDQU 2464(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2496(CX), Y8
+ VMOVDQU 2528(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2560(CX), Y8
+ VMOVDQU 2592(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2624(CX), Y8
+ VMOVDQU 2656(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y10
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2688(CX), Y8
+ VMOVDQU 2720(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2752(CX), Y8
+ VMOVDQU 2784(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2816(CX), Y8
+ VMOVDQU 2848(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2880(CX), Y8
+ VMOVDQU 2912(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2944(CX), Y8
+ VMOVDQU 2976(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3008(CX), Y8
+ VMOVDQU 3040(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3072(CX), Y8
+ VMOVDQU 3104(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (R12), Y10
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 3136(CX), Y8
+ VMOVDQU 3168(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 3200(CX), Y8
+ VMOVDQU 3232(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 3264(CX), Y8
+ VMOVDQU 3296(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 3328(CX), Y8
+ VMOVDQU 3360(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 3392(CX), Y8
+ VMOVDQU 3424(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3456(CX), Y8
+ VMOVDQU 3488(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3520(CX), Y8
+ VMOVDQU 3552(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 8 to 7 outputs
+ VMOVDQU (R13), Y10
+ ADDQ $0x20, R13
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 3584(CX), Y8
+ VMOVDQU 3616(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 3648(CX), Y8
+ VMOVDQU 3680(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 3712(CX), Y8
+ VMOVDQU 3744(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 3776(CX), Y8
+ VMOVDQU 3808(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 3840(CX), Y8
+ VMOVDQU 3872(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3904(CX), Y8
+ VMOVDQU 3936(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3968(CX), Y8
+ VMOVDQU 4000(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 9 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 4032(CX), Y8
+ VMOVDQU 4064(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 4096(CX), Y8
+ VMOVDQU 4128(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 4160(CX), Y8
+ VMOVDQU 4192(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 4224(CX), Y8
+ VMOVDQU 4256(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 4288(CX), Y8
+ VMOVDQU 4320(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 4352(CX), Y8
+ VMOVDQU 4384(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 4416(CX), Y8
+ VMOVDQU 4448(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y0, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y1, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y2, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y3, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y4, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxTwo_10x7_loop
+ VZEROUPPER
+
+mulAvxTwo_10x7_end:
+ RET
+
+// func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x7_64(SB), $8-88
+ // Loading 23 of 70 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 79 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 7 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 7 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 7 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z23, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x7_64_loop
+ VZEROUPPER
+
+mulGFNI_10x7_64_end:
+ RET
+
+// func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x7(SB), $8-88
+ // Loading 7 of 70 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 79 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 7 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x7_end:
+ RET
+
+// func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x7_64Xor(SB), $8-88
+ // Loading 23 of 70 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 79 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x7_64Xor_loop:
+ // Load 7 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 (BP)(R15*1), Z23
+ MOVQ 24(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z24
+ MOVQ 48(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z25
+ MOVQ 72(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z26
+ MOVQ 96(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z27
+ MOVQ 120(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z28
+ MOVQ 144(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z29
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 7 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 7 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 7 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z23, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x7Xor(SB), $8-88
+ // Loading 7 of 70 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 79 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x7Xor_loop:
+ // Load 7 outputs
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y7
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y8
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y9
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y10
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y11
+ MOVQ 120(R14), BP
+ VMOVDQU (BP)(R15*1), Y12
+ MOVQ 144(R14), BP
+ VMOVDQU (BP)(R15*1), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 7 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x7Xor_end:
+ RET
+
+// func mulAvxTwo_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x7Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 152 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x7Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X7
+ VPBROADCASTB X7, Y7
+
+mulAvxTwo_10x7Xor_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y0
+ VMOVDQU (CX), Y8
+ VMOVDQU 32(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y1
+ VMOVDQU 64(CX), Y8
+ VMOVDQU 96(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y2
+ VMOVDQU 128(CX), Y8
+ VMOVDQU 160(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y3
+ VMOVDQU 192(CX), Y8
+ VMOVDQU 224(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y4
+ VMOVDQU 256(CX), Y8
+ VMOVDQU 288(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ MOVQ 120(R14), BP
+ VMOVDQU (BP)(R15*1), Y5
+ VMOVDQU 320(CX), Y8
+ VMOVDQU 352(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ MOVQ 144(R14), BP
+ VMOVDQU (BP)(R15*1), Y6
+ VMOVDQU 384(CX), Y8
+ VMOVDQU 416(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 448(CX), Y8
+ VMOVDQU 480(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 512(CX), Y8
+ VMOVDQU 544(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 576(CX), Y8
+ VMOVDQU 608(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 640(CX), Y8
+ VMOVDQU 672(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 704(CX), Y8
+ VMOVDQU 736(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 768(CX), Y8
+ VMOVDQU 800(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 832(CX), Y8
+ VMOVDQU 864(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 896(CX), Y8
+ VMOVDQU 928(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 960(CX), Y8
+ VMOVDQU 992(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1024(CX), Y8
+ VMOVDQU 1056(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1088(CX), Y8
+ VMOVDQU 1120(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1152(CX), Y8
+ VMOVDQU 1184(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1216(CX), Y8
+ VMOVDQU 1248(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1280(CX), Y8
+ VMOVDQU 1312(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y10
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1344(CX), Y8
+ VMOVDQU 1376(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1408(CX), Y8
+ VMOVDQU 1440(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1472(CX), Y8
+ VMOVDQU 1504(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1536(CX), Y8
+ VMOVDQU 1568(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 1600(CX), Y8
+ VMOVDQU 1632(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 1664(CX), Y8
+ VMOVDQU 1696(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 1728(CX), Y8
+ VMOVDQU 1760(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y10
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 1792(CX), Y8
+ VMOVDQU 1824(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 1856(CX), Y8
+ VMOVDQU 1888(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 1920(CX), Y8
+ VMOVDQU 1952(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 1984(CX), Y8
+ VMOVDQU 2016(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2048(CX), Y8
+ VMOVDQU 2080(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2112(CX), Y8
+ VMOVDQU 2144(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2176(CX), Y8
+ VMOVDQU 2208(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y10
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2240(CX), Y8
+ VMOVDQU 2272(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2304(CX), Y8
+ VMOVDQU 2336(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2368(CX), Y8
+ VMOVDQU 2400(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2432(CX), Y8
+ VMOVDQU 2464(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2496(CX), Y8
+ VMOVDQU 2528(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 2560(CX), Y8
+ VMOVDQU 2592(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 2624(CX), Y8
+ VMOVDQU 2656(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y10
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 2688(CX), Y8
+ VMOVDQU 2720(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 2752(CX), Y8
+ VMOVDQU 2784(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 2816(CX), Y8
+ VMOVDQU 2848(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 2880(CX), Y8
+ VMOVDQU 2912(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 2944(CX), Y8
+ VMOVDQU 2976(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3008(CX), Y8
+ VMOVDQU 3040(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3072(CX), Y8
+ VMOVDQU 3104(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (R12), Y10
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 3136(CX), Y8
+ VMOVDQU 3168(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 3200(CX), Y8
+ VMOVDQU 3232(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 3264(CX), Y8
+ VMOVDQU 3296(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 3328(CX), Y8
+ VMOVDQU 3360(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 3392(CX), Y8
+ VMOVDQU 3424(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3456(CX), Y8
+ VMOVDQU 3488(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3520(CX), Y8
+ VMOVDQU 3552(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 8 to 7 outputs
+ VMOVDQU (R13), Y10
+ ADDQ $0x20, R13
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 3584(CX), Y8
+ VMOVDQU 3616(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 3648(CX), Y8
+ VMOVDQU 3680(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 3712(CX), Y8
+ VMOVDQU 3744(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 3776(CX), Y8
+ VMOVDQU 3808(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 3840(CX), Y8
+ VMOVDQU 3872(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 3904(CX), Y8
+ VMOVDQU 3936(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 3968(CX), Y8
+ VMOVDQU 4000(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Load and process 32 bytes from input 9 to 7 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y10, Y11
+ VPAND Y7, Y10, Y10
+ VPAND Y7, Y11, Y11
+ VMOVDQU 4032(CX), Y8
+ VMOVDQU 4064(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y0)
+ VMOVDQU 4096(CX), Y8
+ VMOVDQU 4128(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y1)
+ VMOVDQU 4160(CX), Y8
+ VMOVDQU 4192(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y2)
+ VMOVDQU 4224(CX), Y8
+ VMOVDQU 4256(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y3)
+ VMOVDQU 4288(CX), Y8
+ VMOVDQU 4320(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y4)
+ VMOVDQU 4352(CX), Y8
+ VMOVDQU 4384(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y5)
+ VMOVDQU 4416(CX), Y8
+ VMOVDQU 4448(CX), Y9
+ VPSHUFB Y10, Y8, Y8
+ VPSHUFB Y11, Y9, Y9
+ XOR3WAY( $0x00, Y8, Y9, Y6)
+
+ // Store 7 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y0, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y1, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y2, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y3, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y4, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxTwo_10x7Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_10x7Xor_end:
+ RET
+
+// func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x8(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 173 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x8_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_10x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y0
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y1
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y2
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y3
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y4
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y5
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y6
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ VPXOR Y9, Y10, Y7
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1024(CX), Y9
+ VMOVDQU 1056(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1088(CX), Y9
+ VMOVDQU 1120(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1152(CX), Y9
+ VMOVDQU 1184(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1216(CX), Y9
+ VMOVDQU 1248(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y9
+ VMOVDQU 1312(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1344(CX), Y9
+ VMOVDQU 1376(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1408(CX), Y9
+ VMOVDQU 1440(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1472(CX), Y9
+ VMOVDQU 1504(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y11
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1536(CX), Y9
+ VMOVDQU 1568(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1600(CX), Y9
+ VMOVDQU 1632(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1664(CX), Y9
+ VMOVDQU 1696(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1728(CX), Y9
+ VMOVDQU 1760(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1792(CX), Y9
+ VMOVDQU 1824(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1856(CX), Y9
+ VMOVDQU 1888(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1920(CX), Y9
+ VMOVDQU 1952(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1984(CX), Y9
+ VMOVDQU 2016(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y11
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2048(CX), Y9
+ VMOVDQU 2080(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2112(CX), Y9
+ VMOVDQU 2144(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2176(CX), Y9
+ VMOVDQU 2208(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2240(CX), Y9
+ VMOVDQU 2272(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2304(CX), Y9
+ VMOVDQU 2336(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2368(CX), Y9
+ VMOVDQU 2400(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2432(CX), Y9
+ VMOVDQU 2464(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 2496(CX), Y9
+ VMOVDQU 2528(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y11
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2560(CX), Y9
+ VMOVDQU 2592(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2624(CX), Y9
+ VMOVDQU 2656(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2688(CX), Y9
+ VMOVDQU 2720(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2752(CX), Y9
+ VMOVDQU 2784(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2816(CX), Y9
+ VMOVDQU 2848(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2880(CX), Y9
+ VMOVDQU 2912(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2944(CX), Y9
+ VMOVDQU 2976(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3008(CX), Y9
+ VMOVDQU 3040(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y11
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 3072(CX), Y9
+ VMOVDQU 3104(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 3136(CX), Y9
+ VMOVDQU 3168(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 3200(CX), Y9
+ VMOVDQU 3232(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 3264(CX), Y9
+ VMOVDQU 3296(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 3328(CX), Y9
+ VMOVDQU 3360(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 3392(CX), Y9
+ VMOVDQU 3424(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 3456(CX), Y9
+ VMOVDQU 3488(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3520(CX), Y9
+ VMOVDQU 3552(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (R12), Y11
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 3584(CX), Y9
+ VMOVDQU 3616(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 3648(CX), Y9
+ VMOVDQU 3680(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 3712(CX), Y9
+ VMOVDQU 3744(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 3776(CX), Y9
+ VMOVDQU 3808(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 3840(CX), Y9
+ VMOVDQU 3872(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 3904(CX), Y9
+ VMOVDQU 3936(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 3968(CX), Y9
+ VMOVDQU 4000(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 4032(CX), Y9
+ VMOVDQU 4064(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 8 to 8 outputs
+ VMOVDQU (R13), Y11
+ ADDQ $0x20, R13
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 4096(CX), Y9
+ VMOVDQU 4128(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 4160(CX), Y9
+ VMOVDQU 4192(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 4224(CX), Y9
+ VMOVDQU 4256(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 4288(CX), Y9
+ VMOVDQU 4320(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 4352(CX), Y9
+ VMOVDQU 4384(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 4416(CX), Y9
+ VMOVDQU 4448(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 4480(CX), Y9
+ VMOVDQU 4512(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 4544(CX), Y9
+ VMOVDQU 4576(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 9 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 4608(CX), Y9
+ VMOVDQU 4640(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 4672(CX), Y9
+ VMOVDQU 4704(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 4736(CX), Y9
+ VMOVDQU 4768(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 4800(CX), Y9
+ VMOVDQU 4832(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 4864(CX), Y9
+ VMOVDQU 4896(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 4928(CX), Y9
+ VMOVDQU 4960(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 4992(CX), Y9
+ VMOVDQU 5024(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 5056(CX), Y9
+ VMOVDQU 5088(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y0, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y1, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y2, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y3, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y4, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxTwo_10x8_loop
+ VZEROUPPER
+
+mulAvxTwo_10x8_end:
+ RET
+
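+// The mulGFNI_*_64 kernels use GF2P8AFFINEQB instead of table lookups: each
+// matrix coefficient is encoded as an 8x8 bit matrix (one uint64), so a
+// single VGF2P8AFFINEQB multiplies all 64 input bytes in a Z register by that
+// coefficient. Here only the first 22 of the 80 coefficient matrices fit in
+// Z0-Z21; the remainder are applied with the .BCST memory-broadcast form, and
+// partial products are accumulated with VXORPD. n is consumed 64 bytes per
+// iteration (SHRQ $0x06).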
+// func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x8_64(SB), $8-88
+ // Loading 22 of 80 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 90 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 8 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 8 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 8 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z22, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z23, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x8_64_loop
+ VZEROUPPER
+
+mulGFNI_10x8_64_end:
+ RET
+
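+// mulAvxGFNI_* is the 256-bit GFNI fallback for CPUs with GFNI+AVX but
+// without the AVX512 path above: 32 bytes per iteration, and with only 16 Y
+// registers just 6 of the 80 coefficient matrices stay resident (Y0-Y5); the
+// rest are re-broadcast from the matrix slice with VBROADCASTSD inside the
+// loop before each VGF2P8AFFINEQB.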
+// func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x8(SB), $8-88
+ // Loading 6 of 80 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 90 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 8 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x8_end:
+ RET
+
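+// The *Xor variants accumulate into the existing output instead of
+// overwriting it: each iteration first loads the 8 current output blocks
+// (here into Z22-Z29) and every affine product is then xored on top, so the
+// caller can fold contributions from several input groups into the same
+// destination shards.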
+// func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x8_64Xor(SB), $8-88
+ // Loading 22 of 80 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 90 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x8_64Xor_loop:
+ // Load 8 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 (BP)(R15*1), Z22
+ MOVQ 24(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z23
+ MOVQ 48(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z24
+ MOVQ 72(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z25
+ MOVQ 96(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z26
+ MOVQ 120(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z27
+ MOVQ 144(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z28
+ MOVQ 168(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z29
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 8 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 8 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 8 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z22, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z23, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x8Xor(SB), $8-88
+ // Loading 6 of 80 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 90 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x8Xor_loop:
+ // Load 8 outputs
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y6
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y7
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y8
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y9
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y10
+ MOVQ 120(R14), BP
+ VMOVDQU (BP)(R15*1), Y11
+ MOVQ 144(R14), BP
+ VMOVDQU (BP)(R15*1), Y12
+ MOVQ 168(R14), BP
+ VMOVDQU (BP)(R15*1), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 8 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x8Xor_end:
+ RET
+
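+// mulAvxTwo_10x8Xor mirrors mulAvxTwo_10x8, except that while processing the
+// first input each accumulator Y0-Y7 is initialized from the corresponding
+// output slice (VMOVDQU (BP)(R15*1)) and updated with XOR3WAY rather than
+// being freshly computed with VPXOR, so existing output data is preserved.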
+// func mulAvxTwo_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x8Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 173 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x8Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X8
+ VPBROADCASTB X8, Y8
+
+mulAvxTwo_10x8Xor_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y0
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y1
+ VMOVDQU 64(CX), Y9
+ VMOVDQU 96(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y2
+ VMOVDQU 128(CX), Y9
+ VMOVDQU 160(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y3
+ VMOVDQU 192(CX), Y9
+ VMOVDQU 224(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y4
+ VMOVDQU 256(CX), Y9
+ VMOVDQU 288(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ MOVQ 120(R14), BP
+ VMOVDQU (BP)(R15*1), Y5
+ VMOVDQU 320(CX), Y9
+ VMOVDQU 352(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ MOVQ 144(R14), BP
+ VMOVDQU (BP)(R15*1), Y6
+ VMOVDQU 384(CX), Y9
+ VMOVDQU 416(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ MOVQ 168(R14), BP
+ VMOVDQU (BP)(R15*1), Y7
+ VMOVDQU 448(CX), Y9
+ VMOVDQU 480(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 512(CX), Y9
+ VMOVDQU 544(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 576(CX), Y9
+ VMOVDQU 608(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 640(CX), Y9
+ VMOVDQU 672(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 704(CX), Y9
+ VMOVDQU 736(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 768(CX), Y9
+ VMOVDQU 800(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 832(CX), Y9
+ VMOVDQU 864(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 896(CX), Y9
+ VMOVDQU 928(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 960(CX), Y9
+ VMOVDQU 992(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1024(CX), Y9
+ VMOVDQU 1056(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1088(CX), Y9
+ VMOVDQU 1120(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1152(CX), Y9
+ VMOVDQU 1184(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1216(CX), Y9
+ VMOVDQU 1248(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1280(CX), Y9
+ VMOVDQU 1312(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1344(CX), Y9
+ VMOVDQU 1376(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1408(CX), Y9
+ VMOVDQU 1440(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1472(CX), Y9
+ VMOVDQU 1504(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y11
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 1536(CX), Y9
+ VMOVDQU 1568(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 1600(CX), Y9
+ VMOVDQU 1632(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 1664(CX), Y9
+ VMOVDQU 1696(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 1728(CX), Y9
+ VMOVDQU 1760(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 1792(CX), Y9
+ VMOVDQU 1824(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 1856(CX), Y9
+ VMOVDQU 1888(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 1920(CX), Y9
+ VMOVDQU 1952(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 1984(CX), Y9
+ VMOVDQU 2016(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y11
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2048(CX), Y9
+ VMOVDQU 2080(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2112(CX), Y9
+ VMOVDQU 2144(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2176(CX), Y9
+ VMOVDQU 2208(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2240(CX), Y9
+ VMOVDQU 2272(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2304(CX), Y9
+ VMOVDQU 2336(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2368(CX), Y9
+ VMOVDQU 2400(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2432(CX), Y9
+ VMOVDQU 2464(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 2496(CX), Y9
+ VMOVDQU 2528(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y11
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 2560(CX), Y9
+ VMOVDQU 2592(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 2624(CX), Y9
+ VMOVDQU 2656(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 2688(CX), Y9
+ VMOVDQU 2720(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 2752(CX), Y9
+ VMOVDQU 2784(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 2816(CX), Y9
+ VMOVDQU 2848(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 2880(CX), Y9
+ VMOVDQU 2912(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 2944(CX), Y9
+ VMOVDQU 2976(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3008(CX), Y9
+ VMOVDQU 3040(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y11
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 3072(CX), Y9
+ VMOVDQU 3104(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 3136(CX), Y9
+ VMOVDQU 3168(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 3200(CX), Y9
+ VMOVDQU 3232(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 3264(CX), Y9
+ VMOVDQU 3296(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 3328(CX), Y9
+ VMOVDQU 3360(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 3392(CX), Y9
+ VMOVDQU 3424(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 3456(CX), Y9
+ VMOVDQU 3488(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 3520(CX), Y9
+ VMOVDQU 3552(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (R12), Y11
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 3584(CX), Y9
+ VMOVDQU 3616(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 3648(CX), Y9
+ VMOVDQU 3680(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 3712(CX), Y9
+ VMOVDQU 3744(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 3776(CX), Y9
+ VMOVDQU 3808(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 3840(CX), Y9
+ VMOVDQU 3872(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 3904(CX), Y9
+ VMOVDQU 3936(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 3968(CX), Y9
+ VMOVDQU 4000(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 4032(CX), Y9
+ VMOVDQU 4064(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 8 to 8 outputs
+ VMOVDQU (R13), Y11
+ ADDQ $0x20, R13
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 4096(CX), Y9
+ VMOVDQU 4128(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 4160(CX), Y9
+ VMOVDQU 4192(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 4224(CX), Y9
+ VMOVDQU 4256(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 4288(CX), Y9
+ VMOVDQU 4320(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 4352(CX), Y9
+ VMOVDQU 4384(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 4416(CX), Y9
+ VMOVDQU 4448(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 4480(CX), Y9
+ VMOVDQU 4512(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 4544(CX), Y9
+ VMOVDQU 4576(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Load and process 32 bytes from input 9 to 8 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y11, Y12
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y12, Y12
+ VMOVDQU 4608(CX), Y9
+ VMOVDQU 4640(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y0)
+ VMOVDQU 4672(CX), Y9
+ VMOVDQU 4704(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y1)
+ VMOVDQU 4736(CX), Y9
+ VMOVDQU 4768(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VMOVDQU 4800(CX), Y9
+ VMOVDQU 4832(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y3)
+ VMOVDQU 4864(CX), Y9
+ VMOVDQU 4896(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU 4928(CX), Y9
+ VMOVDQU 4960(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+ VMOVDQU 4992(CX), Y9
+ VMOVDQU 5024(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VMOVDQU 5056(CX), Y9
+ VMOVDQU 5088(CX), Y10
+ VPSHUFB Y11, Y9, Y9
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+
+ // Store 8 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y0, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y1, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y2, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y3, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y4, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxTwo_10x8Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_10x8Xor_end:
+ RET
+
+// func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x9(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 194 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x9_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_10x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y0
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y1
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y2
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y3
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y4
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y5
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y6
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y7
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ VPXOR Y10, Y11, Y8
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y12
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1152(CX), Y10
+ VMOVDQU 1184(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1216(CX), Y10
+ VMOVDQU 1248(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1280(CX), Y10
+ VMOVDQU 1312(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1344(CX), Y10
+ VMOVDQU 1376(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1408(CX), Y10
+ VMOVDQU 1440(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 1472(CX), Y10
+ VMOVDQU 1504(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 1536(CX), Y10
+ VMOVDQU 1568(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1600(CX), Y10
+ VMOVDQU 1632(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1664(CX), Y10
+ VMOVDQU 1696(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y12
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1728(CX), Y10
+ VMOVDQU 1760(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1792(CX), Y10
+ VMOVDQU 1824(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1856(CX), Y10
+ VMOVDQU 1888(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1920(CX), Y10
+ VMOVDQU 1952(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1984(CX), Y10
+ VMOVDQU 2016(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2048(CX), Y10
+ VMOVDQU 2080(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2112(CX), Y10
+ VMOVDQU 2144(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2176(CX), Y10
+ VMOVDQU 2208(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2240(CX), Y10
+ VMOVDQU 2272(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y12
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2304(CX), Y10
+ VMOVDQU 2336(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2368(CX), Y10
+ VMOVDQU 2400(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 2432(CX), Y10
+ VMOVDQU 2464(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 2496(CX), Y10
+ VMOVDQU 2528(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 2560(CX), Y10
+ VMOVDQU 2592(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2624(CX), Y10
+ VMOVDQU 2656(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2688(CX), Y10
+ VMOVDQU 2720(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2752(CX), Y10
+ VMOVDQU 2784(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2816(CX), Y10
+ VMOVDQU 2848(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y12
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2880(CX), Y10
+ VMOVDQU 2912(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2944(CX), Y10
+ VMOVDQU 2976(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3008(CX), Y10
+ VMOVDQU 3040(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3072(CX), Y10
+ VMOVDQU 3104(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3136(CX), Y10
+ VMOVDQU 3168(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3200(CX), Y10
+ VMOVDQU 3232(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3264(CX), Y10
+ VMOVDQU 3296(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3328(CX), Y10
+ VMOVDQU 3360(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3392(CX), Y10
+ VMOVDQU 3424(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y12
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 3456(CX), Y10
+ VMOVDQU 3488(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 3520(CX), Y10
+ VMOVDQU 3552(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3584(CX), Y10
+ VMOVDQU 3616(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3648(CX), Y10
+ VMOVDQU 3680(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3712(CX), Y10
+ VMOVDQU 3744(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3776(CX), Y10
+ VMOVDQU 3808(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3840(CX), Y10
+ VMOVDQU 3872(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3904(CX), Y10
+ VMOVDQU 3936(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3968(CX), Y10
+ VMOVDQU 4000(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (R12), Y12
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 4032(CX), Y10
+ VMOVDQU 4064(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 4096(CX), Y10
+ VMOVDQU 4128(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 4160(CX), Y10
+ VMOVDQU 4192(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 4224(CX), Y10
+ VMOVDQU 4256(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 4288(CX), Y10
+ VMOVDQU 4320(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 4352(CX), Y10
+ VMOVDQU 4384(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 4416(CX), Y10
+ VMOVDQU 4448(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 4480(CX), Y10
+ VMOVDQU 4512(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 4544(CX), Y10
+ VMOVDQU 4576(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 8 to 9 outputs
+ VMOVDQU (R13), Y12
+ ADDQ $0x20, R13
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 4608(CX), Y10
+ VMOVDQU 4640(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 4672(CX), Y10
+ VMOVDQU 4704(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 4736(CX), Y10
+ VMOVDQU 4768(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 4800(CX), Y10
+ VMOVDQU 4832(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 4864(CX), Y10
+ VMOVDQU 4896(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 4928(CX), Y10
+ VMOVDQU 4960(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 4992(CX), Y10
+ VMOVDQU 5024(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 5056(CX), Y10
+ VMOVDQU 5088(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 5120(CX), Y10
+ VMOVDQU 5152(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 9 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 5184(CX), Y10
+ VMOVDQU 5216(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 5248(CX), Y10
+ VMOVDQU 5280(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 5312(CX), Y10
+ VMOVDQU 5344(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 5376(CX), Y10
+ VMOVDQU 5408(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 5440(CX), Y10
+ VMOVDQU 5472(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 5504(CX), Y10
+ VMOVDQU 5536(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 5568(CX), Y10
+ VMOVDQU 5600(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 5632(CX), Y10
+ VMOVDQU 5664(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 5696(CX), Y10
+ VMOVDQU 5728(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y0, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y1, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y2, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y3, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y4, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxTwo_10x9_loop
+ VZEROUPPER
+
+mulAvxTwo_10x9_end:
+ RET
+
+// func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x9_64(SB), $8-88
+ // Loading 21 of 90 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 101 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 9 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 9 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 9 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z21, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z22, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z23, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x9_64_loop
+ VZEROUPPER
+
+mulGFNI_10x9_64_end:
+ RET
+
+// func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x9(SB), $8-88
+ // Loading 5 of 90 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 101 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 9 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 640(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 648(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 656(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 664(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 672(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 680(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 688(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 696(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 704(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 712(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x9_end:
+ RET
+
+// func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x9_64Xor(SB), $8-88
+ // Loading 21 of 90 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 101 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x9_64Xor_loop:
+ // Load 9 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 (BP)(R15*1), Z21
+ MOVQ 24(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z22
+ MOVQ 48(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z23
+ MOVQ 72(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z24
+ MOVQ 96(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z25
+ MOVQ 120(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z26
+ MOVQ 144(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z27
+ MOVQ 168(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z28
+ MOVQ 192(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z29
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 9 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 9 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 9 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z21, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z22, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z23, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x9Xor(SB), $8-88
+ // Loading 5 of 90 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 101 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x9Xor_loop:
+ // Load 9 outputs
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y5
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y6
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y7
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y8
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y9
+ MOVQ 120(R14), BP
+ VMOVDQU (BP)(R15*1), Y10
+ MOVQ 144(R14), BP
+ VMOVDQU (BP)(R15*1), Y11
+ MOVQ 168(R14), BP
+ VMOVDQU (BP)(R15*1), Y12
+ MOVQ 192(R14), BP
+ VMOVDQU (BP)(R15*1), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 9 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 640(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 648(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 656(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 664(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 672(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 680(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 688(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 696(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 704(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 712(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x9Xor_end:
+ RET
+
+// func mulAvxTwo_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x9Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 194 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x9Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X9
+ VPBROADCASTB X9, Y9
+
+mulAvxTwo_10x9Xor_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y0
+ VMOVDQU (CX), Y10
+ VMOVDQU 32(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y1
+ VMOVDQU 64(CX), Y10
+ VMOVDQU 96(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y2
+ VMOVDQU 128(CX), Y10
+ VMOVDQU 160(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y3
+ VMOVDQU 192(CX), Y10
+ VMOVDQU 224(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y4
+ VMOVDQU 256(CX), Y10
+ VMOVDQU 288(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ MOVQ 120(R14), BP
+ VMOVDQU (BP)(R15*1), Y5
+ VMOVDQU 320(CX), Y10
+ VMOVDQU 352(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ MOVQ 144(R14), BP
+ VMOVDQU (BP)(R15*1), Y6
+ VMOVDQU 384(CX), Y10
+ VMOVDQU 416(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ MOVQ 168(R14), BP
+ VMOVDQU (BP)(R15*1), Y7
+ VMOVDQU 448(CX), Y10
+ VMOVDQU 480(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ MOVQ 192(R14), BP
+ VMOVDQU (BP)(R15*1), Y8
+ VMOVDQU 512(CX), Y10
+ VMOVDQU 544(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 576(CX), Y10
+ VMOVDQU 608(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 640(CX), Y10
+ VMOVDQU 672(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 704(CX), Y10
+ VMOVDQU 736(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 768(CX), Y10
+ VMOVDQU 800(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 832(CX), Y10
+ VMOVDQU 864(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 896(CX), Y10
+ VMOVDQU 928(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 960(CX), Y10
+ VMOVDQU 992(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1024(CX), Y10
+ VMOVDQU 1056(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1088(CX), Y10
+ VMOVDQU 1120(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y12
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1152(CX), Y10
+ VMOVDQU 1184(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1216(CX), Y10
+ VMOVDQU 1248(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1280(CX), Y10
+ VMOVDQU 1312(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1344(CX), Y10
+ VMOVDQU 1376(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1408(CX), Y10
+ VMOVDQU 1440(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 1472(CX), Y10
+ VMOVDQU 1504(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 1536(CX), Y10
+ VMOVDQU 1568(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 1600(CX), Y10
+ VMOVDQU 1632(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 1664(CX), Y10
+ VMOVDQU 1696(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y12
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 1728(CX), Y10
+ VMOVDQU 1760(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 1792(CX), Y10
+ VMOVDQU 1824(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 1856(CX), Y10
+ VMOVDQU 1888(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 1920(CX), Y10
+ VMOVDQU 1952(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 1984(CX), Y10
+ VMOVDQU 2016(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2048(CX), Y10
+ VMOVDQU 2080(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2112(CX), Y10
+ VMOVDQU 2144(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2176(CX), Y10
+ VMOVDQU 2208(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2240(CX), Y10
+ VMOVDQU 2272(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y12
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2304(CX), Y10
+ VMOVDQU 2336(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2368(CX), Y10
+ VMOVDQU 2400(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 2432(CX), Y10
+ VMOVDQU 2464(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 2496(CX), Y10
+ VMOVDQU 2528(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 2560(CX), Y10
+ VMOVDQU 2592(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 2624(CX), Y10
+ VMOVDQU 2656(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 2688(CX), Y10
+ VMOVDQU 2720(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 2752(CX), Y10
+ VMOVDQU 2784(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 2816(CX), Y10
+ VMOVDQU 2848(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y12
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 2880(CX), Y10
+ VMOVDQU 2912(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 2944(CX), Y10
+ VMOVDQU 2976(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3008(CX), Y10
+ VMOVDQU 3040(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3072(CX), Y10
+ VMOVDQU 3104(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3136(CX), Y10
+ VMOVDQU 3168(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3200(CX), Y10
+ VMOVDQU 3232(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3264(CX), Y10
+ VMOVDQU 3296(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3328(CX), Y10
+ VMOVDQU 3360(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3392(CX), Y10
+ VMOVDQU 3424(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y12
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 3456(CX), Y10
+ VMOVDQU 3488(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 3520(CX), Y10
+ VMOVDQU 3552(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 3584(CX), Y10
+ VMOVDQU 3616(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 3648(CX), Y10
+ VMOVDQU 3680(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 3712(CX), Y10
+ VMOVDQU 3744(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 3776(CX), Y10
+ VMOVDQU 3808(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 3840(CX), Y10
+ VMOVDQU 3872(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 3904(CX), Y10
+ VMOVDQU 3936(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 3968(CX), Y10
+ VMOVDQU 4000(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (R12), Y12
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 4032(CX), Y10
+ VMOVDQU 4064(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 4096(CX), Y10
+ VMOVDQU 4128(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 4160(CX), Y10
+ VMOVDQU 4192(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 4224(CX), Y10
+ VMOVDQU 4256(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 4288(CX), Y10
+ VMOVDQU 4320(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 4352(CX), Y10
+ VMOVDQU 4384(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 4416(CX), Y10
+ VMOVDQU 4448(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 4480(CX), Y10
+ VMOVDQU 4512(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 4544(CX), Y10
+ VMOVDQU 4576(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 8 to 9 outputs
+ VMOVDQU (R13), Y12
+ ADDQ $0x20, R13
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 4608(CX), Y10
+ VMOVDQU 4640(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 4672(CX), Y10
+ VMOVDQU 4704(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 4736(CX), Y10
+ VMOVDQU 4768(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 4800(CX), Y10
+ VMOVDQU 4832(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 4864(CX), Y10
+ VMOVDQU 4896(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 4928(CX), Y10
+ VMOVDQU 4960(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 4992(CX), Y10
+ VMOVDQU 5024(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 5056(CX), Y10
+ VMOVDQU 5088(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 5120(CX), Y10
+ VMOVDQU 5152(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Load and process 32 bytes from input 9 to 9 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y12, Y13
+ VPAND Y9, Y12, Y12
+ VPAND Y9, Y13, Y13
+ VMOVDQU 5184(CX), Y10
+ VMOVDQU 5216(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y0)
+ VMOVDQU 5248(CX), Y10
+ VMOVDQU 5280(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y1)
+ VMOVDQU 5312(CX), Y10
+ VMOVDQU 5344(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y2)
+ VMOVDQU 5376(CX), Y10
+ VMOVDQU 5408(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y3)
+ VMOVDQU 5440(CX), Y10
+ VMOVDQU 5472(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y4)
+ VMOVDQU 5504(CX), Y10
+ VMOVDQU 5536(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y5)
+ VMOVDQU 5568(CX), Y10
+ VMOVDQU 5600(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y6)
+ VMOVDQU 5632(CX), Y10
+ VMOVDQU 5664(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y7)
+ VMOVDQU 5696(CX), Y10
+ VMOVDQU 5728(CX), Y11
+ VPSHUFB Y12, Y10, Y10
+ VPSHUFB Y13, Y11, Y11
+ XOR3WAY( $0x00, Y10, Y11, Y8)
+
+ // Store 9 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y0, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y1, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y2, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y3, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y4, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxTwo_10x9Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_10x9Xor_end:
+ RET
+
+// func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x10(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 215 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x10_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_10x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y0
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y1
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y2
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y3
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y4
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y5
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y6
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y7
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y8
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ VPXOR Y11, Y12, Y9
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y13
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y13
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1280(CX), Y11
+ VMOVDQU 1312(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1344(CX), Y11
+ VMOVDQU 1376(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 1408(CX), Y11
+ VMOVDQU 1440(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 1472(CX), Y11
+ VMOVDQU 1504(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 1536(CX), Y11
+ VMOVDQU 1568(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 1600(CX), Y11
+ VMOVDQU 1632(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1664(CX), Y11
+ VMOVDQU 1696(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1728(CX), Y11
+ VMOVDQU 1760(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1792(CX), Y11
+ VMOVDQU 1824(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1856(CX), Y11
+ VMOVDQU 1888(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y13
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1920(CX), Y11
+ VMOVDQU 1952(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1984(CX), Y11
+ VMOVDQU 2016(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2048(CX), Y11
+ VMOVDQU 2080(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2112(CX), Y11
+ VMOVDQU 2144(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2176(CX), Y11
+ VMOVDQU 2208(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2240(CX), Y11
+ VMOVDQU 2272(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2304(CX), Y11
+ VMOVDQU 2336(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 2368(CX), Y11
+ VMOVDQU 2400(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 2432(CX), Y11
+ VMOVDQU 2464(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 2496(CX), Y11
+ VMOVDQU 2528(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y13
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 2560(CX), Y11
+ VMOVDQU 2592(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 2624(CX), Y11
+ VMOVDQU 2656(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2688(CX), Y11
+ VMOVDQU 2720(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2752(CX), Y11
+ VMOVDQU 2784(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2816(CX), Y11
+ VMOVDQU 2848(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2880(CX), Y11
+ VMOVDQU 2912(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2944(CX), Y11
+ VMOVDQU 2976(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3008(CX), Y11
+ VMOVDQU 3040(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3072(CX), Y11
+ VMOVDQU 3104(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3136(CX), Y11
+ VMOVDQU 3168(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y13
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3200(CX), Y11
+ VMOVDQU 3232(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3264(CX), Y11
+ VMOVDQU 3296(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3328(CX), Y11
+ VMOVDQU 3360(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 3392(CX), Y11
+ VMOVDQU 3424(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 3456(CX), Y11
+ VMOVDQU 3488(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 3520(CX), Y11
+ VMOVDQU 3552(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 3584(CX), Y11
+ VMOVDQU 3616(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3648(CX), Y11
+ VMOVDQU 3680(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3712(CX), Y11
+ VMOVDQU 3744(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3776(CX), Y11
+ VMOVDQU 3808(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y13
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3840(CX), Y11
+ VMOVDQU 3872(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3904(CX), Y11
+ VMOVDQU 3936(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3968(CX), Y11
+ VMOVDQU 4000(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 4032(CX), Y11
+ VMOVDQU 4064(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 4096(CX), Y11
+ VMOVDQU 4128(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 4160(CX), Y11
+ VMOVDQU 4192(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 4224(CX), Y11
+ VMOVDQU 4256(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 4288(CX), Y11
+ VMOVDQU 4320(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 4352(CX), Y11
+ VMOVDQU 4384(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 4416(CX), Y11
+ VMOVDQU 4448(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (R12), Y13
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 4480(CX), Y11
+ VMOVDQU 4512(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 4544(CX), Y11
+ VMOVDQU 4576(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 4608(CX), Y11
+ VMOVDQU 4640(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 4672(CX), Y11
+ VMOVDQU 4704(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 4736(CX), Y11
+ VMOVDQU 4768(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 4800(CX), Y11
+ VMOVDQU 4832(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 4864(CX), Y11
+ VMOVDQU 4896(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 4928(CX), Y11
+ VMOVDQU 4960(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 4992(CX), Y11
+ VMOVDQU 5024(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 5056(CX), Y11
+ VMOVDQU 5088(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 8 to 10 outputs
+ VMOVDQU (R13), Y13
+ ADDQ $0x20, R13
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 5120(CX), Y11
+ VMOVDQU 5152(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 5184(CX), Y11
+ VMOVDQU 5216(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 5248(CX), Y11
+ VMOVDQU 5280(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 5312(CX), Y11
+ VMOVDQU 5344(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 5376(CX), Y11
+ VMOVDQU 5408(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 5440(CX), Y11
+ VMOVDQU 5472(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 5504(CX), Y11
+ VMOVDQU 5536(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 5568(CX), Y11
+ VMOVDQU 5600(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 5632(CX), Y11
+ VMOVDQU 5664(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 5696(CX), Y11
+ VMOVDQU 5728(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 9 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 5760(CX), Y11
+ VMOVDQU 5792(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 5824(CX), Y11
+ VMOVDQU 5856(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 5888(CX), Y11
+ VMOVDQU 5920(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 5952(CX), Y11
+ VMOVDQU 5984(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 6016(CX), Y11
+ VMOVDQU 6048(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 6080(CX), Y11
+ VMOVDQU 6112(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 6144(CX), Y11
+ VMOVDQU 6176(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 6208(CX), Y11
+ VMOVDQU 6240(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 6272(CX), Y11
+ VMOVDQU 6304(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 6336(CX), Y11
+ VMOVDQU 6368(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y0, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y1, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y2, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y3, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y4, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 216(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxTwo_10x10_loop
+ VZEROUPPER
+
+mulAvxTwo_10x10_end:
+ RET
+
+// func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x10_64(SB), $8-88
+ // Loading 20 of 100 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 112 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 10 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 10 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 10 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z20, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z21, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z22, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z23, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 216(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x10_64_loop
+ VZEROUPPER
+
+mulGFNI_10x10_64_end:
+ RET
+
+// func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x10(SB), $8-88
+ // Loading 4 of 100 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 112 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 10 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 640(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 648(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 656(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 664(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 672(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 680(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 688(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 696(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 704(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 712(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 720(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 728(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 736(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 744(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 752(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 760(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 768(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 776(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 784(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 792(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y4, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 216(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x10_end:
+ RET
+
+// func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x10_64Xor(SB), $8-88
+ // Loading 20 of 100 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 112 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x10_64Xor_loop:
+ // Load 10 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 (BP)(R15*1), Z20
+ MOVQ 24(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z21
+ MOVQ 48(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z22
+ MOVQ 72(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z23
+ MOVQ 96(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z24
+ MOVQ 120(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z25
+ MOVQ 144(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z26
+ MOVQ 168(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z27
+ MOVQ 192(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z28
+ MOVQ 216(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 10 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 10 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 10 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z20, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z21, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z22, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z23, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 216(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x10_64Xor_end:
+ RET
+
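+// mulAvxGFNI_10x10Xor multiplies 10 input shards into 10 outputs with the GFNI
+// affine instruction (VGF2P8AFFINEQB) on 32-byte blocks. The first 4 of the 100
+// 64-bit coefficient tables stay in Y0-Y3; the rest are broadcast from the matrix
+// slice as needed. Being the Xor variant, each output block is loaded first and
+// the products are XORed into it, so existing output data is accumulated rather
+// than overwritten.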
+// func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x10Xor(SB), $8-88
+ // Loading 4 of 100 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 112 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x10Xor_loop:
+ // Load 10 outputs
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y4
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y5
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y6
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y7
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y8
+ MOVQ 120(R14), BP
+ VMOVDQU (BP)(R15*1), Y9
+ MOVQ 144(R14), BP
+ VMOVDQU (BP)(R15*1), Y10
+ MOVQ 168(R14), BP
+ VMOVDQU (BP)(R15*1), Y11
+ MOVQ 192(R14), BP
+ VMOVDQU (BP)(R15*1), Y12
+ MOVQ 216(R14), BP
+ VMOVDQU (BP)(R15*1), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 10 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 640(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 648(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 656(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 664(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 672(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 680(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 688(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 696(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 704(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 712(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 720(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 728(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 736(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 744(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 752(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 760(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 768(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 776(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 784(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 792(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y4, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 216(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x10Xor_end:
+ RET
+
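+// mulAvxTwo_10x10Xor is the AVX2 variant of the same 10x10 Xor multiply. Each
+// input byte is split into low and high nibbles (VPSRLQ $4 / VPAND $0x0f), the
+// nibbles index a pair of 32-byte VPSHUFB lookup tables per input/output pair,
+// and both partial products are folded into the accumulator loaded from the
+// destination slice via XOR3WAY.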
+// func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·mulAvxTwo_10x10Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 215 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxTwo_10x10Xor_end
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+ MOVQ $0x0000000f, BP
+ MOVQ BP, X10
+ VPBROADCASTB X10, Y10
+
+mulAvxTwo_10x10Xor_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y13
+ ADDQ $0x20, BX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y0
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y1
+ VMOVDQU 64(CX), Y11
+ VMOVDQU 96(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y2
+ VMOVDQU 128(CX), Y11
+ VMOVDQU 160(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y3
+ VMOVDQU 192(CX), Y11
+ VMOVDQU 224(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y4
+ VMOVDQU 256(CX), Y11
+ VMOVDQU 288(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ MOVQ 120(R14), BP
+ VMOVDQU (BP)(R15*1), Y5
+ VMOVDQU 320(CX), Y11
+ VMOVDQU 352(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ MOVQ 144(R14), BP
+ VMOVDQU (BP)(R15*1), Y6
+ VMOVDQU 384(CX), Y11
+ VMOVDQU 416(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ MOVQ 168(R14), BP
+ VMOVDQU (BP)(R15*1), Y7
+ VMOVDQU 448(CX), Y11
+ VMOVDQU 480(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ MOVQ 192(R14), BP
+ VMOVDQU (BP)(R15*1), Y8
+ VMOVDQU 512(CX), Y11
+ VMOVDQU 544(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ MOVQ 216(R14), BP
+ VMOVDQU (BP)(R15*1), Y9
+ VMOVDQU 576(CX), Y11
+ VMOVDQU 608(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y13
+ ADDQ $0x20, SI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 640(CX), Y11
+ VMOVDQU 672(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 704(CX), Y11
+ VMOVDQU 736(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 768(CX), Y11
+ VMOVDQU 800(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 832(CX), Y11
+ VMOVDQU 864(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 896(CX), Y11
+ VMOVDQU 928(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 960(CX), Y11
+ VMOVDQU 992(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1024(CX), Y11
+ VMOVDQU 1056(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1088(CX), Y11
+ VMOVDQU 1120(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1152(CX), Y11
+ VMOVDQU 1184(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1216(CX), Y11
+ VMOVDQU 1248(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y13
+ ADDQ $0x20, DI
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1280(CX), Y11
+ VMOVDQU 1312(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1344(CX), Y11
+ VMOVDQU 1376(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 1408(CX), Y11
+ VMOVDQU 1440(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 1472(CX), Y11
+ VMOVDQU 1504(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 1536(CX), Y11
+ VMOVDQU 1568(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 1600(CX), Y11
+ VMOVDQU 1632(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 1664(CX), Y11
+ VMOVDQU 1696(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 1728(CX), Y11
+ VMOVDQU 1760(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 1792(CX), Y11
+ VMOVDQU 1824(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 1856(CX), Y11
+ VMOVDQU 1888(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y13
+ ADDQ $0x20, R8
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 1920(CX), Y11
+ VMOVDQU 1952(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 1984(CX), Y11
+ VMOVDQU 2016(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2048(CX), Y11
+ VMOVDQU 2080(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2112(CX), Y11
+ VMOVDQU 2144(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2176(CX), Y11
+ VMOVDQU 2208(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2240(CX), Y11
+ VMOVDQU 2272(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2304(CX), Y11
+ VMOVDQU 2336(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 2368(CX), Y11
+ VMOVDQU 2400(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 2432(CX), Y11
+ VMOVDQU 2464(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 2496(CX), Y11
+ VMOVDQU 2528(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y13
+ ADDQ $0x20, R9
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 2560(CX), Y11
+ VMOVDQU 2592(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 2624(CX), Y11
+ VMOVDQU 2656(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 2688(CX), Y11
+ VMOVDQU 2720(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 2752(CX), Y11
+ VMOVDQU 2784(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 2816(CX), Y11
+ VMOVDQU 2848(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 2880(CX), Y11
+ VMOVDQU 2912(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 2944(CX), Y11
+ VMOVDQU 2976(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3008(CX), Y11
+ VMOVDQU 3040(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3072(CX), Y11
+ VMOVDQU 3104(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3136(CX), Y11
+ VMOVDQU 3168(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y13
+ ADDQ $0x20, R10
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3200(CX), Y11
+ VMOVDQU 3232(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3264(CX), Y11
+ VMOVDQU 3296(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3328(CX), Y11
+ VMOVDQU 3360(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 3392(CX), Y11
+ VMOVDQU 3424(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 3456(CX), Y11
+ VMOVDQU 3488(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 3520(CX), Y11
+ VMOVDQU 3552(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 3584(CX), Y11
+ VMOVDQU 3616(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 3648(CX), Y11
+ VMOVDQU 3680(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 3712(CX), Y11
+ VMOVDQU 3744(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 3776(CX), Y11
+ VMOVDQU 3808(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y13
+ ADDQ $0x20, R11
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 3840(CX), Y11
+ VMOVDQU 3872(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 3904(CX), Y11
+ VMOVDQU 3936(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 3968(CX), Y11
+ VMOVDQU 4000(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 4032(CX), Y11
+ VMOVDQU 4064(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 4096(CX), Y11
+ VMOVDQU 4128(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 4160(CX), Y11
+ VMOVDQU 4192(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 4224(CX), Y11
+ VMOVDQU 4256(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 4288(CX), Y11
+ VMOVDQU 4320(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 4352(CX), Y11
+ VMOVDQU 4384(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 4416(CX), Y11
+ VMOVDQU 4448(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (R12), Y13
+ ADDQ $0x20, R12
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 4480(CX), Y11
+ VMOVDQU 4512(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 4544(CX), Y11
+ VMOVDQU 4576(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 4608(CX), Y11
+ VMOVDQU 4640(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 4672(CX), Y11
+ VMOVDQU 4704(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 4736(CX), Y11
+ VMOVDQU 4768(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 4800(CX), Y11
+ VMOVDQU 4832(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 4864(CX), Y11
+ VMOVDQU 4896(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 4928(CX), Y11
+ VMOVDQU 4960(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 4992(CX), Y11
+ VMOVDQU 5024(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 5056(CX), Y11
+ VMOVDQU 5088(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 8 to 10 outputs
+ VMOVDQU (R13), Y13
+ ADDQ $0x20, R13
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 5120(CX), Y11
+ VMOVDQU 5152(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 5184(CX), Y11
+ VMOVDQU 5216(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 5248(CX), Y11
+ VMOVDQU 5280(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 5312(CX), Y11
+ VMOVDQU 5344(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 5376(CX), Y11
+ VMOVDQU 5408(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 5440(CX), Y11
+ VMOVDQU 5472(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 5504(CX), Y11
+ VMOVDQU 5536(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 5568(CX), Y11
+ VMOVDQU 5600(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 5632(CX), Y11
+ VMOVDQU 5664(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 5696(CX), Y11
+ VMOVDQU 5728(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Load and process 32 bytes from input 9 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VPSRLQ $0x04, Y13, Y14
+ VPAND Y10, Y13, Y13
+ VPAND Y10, Y14, Y14
+ VMOVDQU 5760(CX), Y11
+ VMOVDQU 5792(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y0)
+ VMOVDQU 5824(CX), Y11
+ VMOVDQU 5856(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y1)
+ VMOVDQU 5888(CX), Y11
+ VMOVDQU 5920(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y2)
+ VMOVDQU 5952(CX), Y11
+ VMOVDQU 5984(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+ VMOVDQU 6016(CX), Y11
+ VMOVDQU 6048(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VMOVDQU 6080(CX), Y11
+ VMOVDQU 6112(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+ VMOVDQU 6144(CX), Y11
+ VMOVDQU 6176(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU 6208(CX), Y11
+ VMOVDQU 6240(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+ VMOVDQU 6272(CX), Y11
+ VMOVDQU 6304(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU 6336(CX), Y11
+ VMOVDQU 6368(CX), Y12
+ VPSHUFB Y13, Y11, Y11
+ VPSHUFB Y14, Y12, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+
+ // Store 10 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y0, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y1, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y2, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y3, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y4, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 216(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxTwo_10x10Xor_loop
+ VZEROUPPER
+
+mulAvxTwo_10x10Xor_end:
+ RET
+
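+// The *_avx2 helpers below (ifftDIT2_avx2, fftDIT2_avx2, mulgf16_avx2) work on
+// 64-byte blocks of x and y with a shared 128-byte table: eight 16-byte LUTs are
+// broadcast into Y0-Y7 and indexed by the low and high nibbles of each byte.
+// ifftDIT2 XORs x into y and then multiply-accumulates the new y into x, fftDIT2
+// multiply-accumulates y into x and then XORs the updated x into y, and mulgf16
+// overwrites x with the table product of y.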
+// func ifftDIT2_avx2(x []byte, y []byte, table *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT2_avx2(SB), NOSPLIT, $0-56
+ MOVQ table+48(FP), AX
+ VBROADCASTI128 (AX), Y0
+ VBROADCASTI128 64(AX), Y1
+ VBROADCASTI128 16(AX), Y2
+ VBROADCASTI128 80(AX), Y3
+ VBROADCASTI128 32(AX), Y4
+ VBROADCASTI128 96(AX), Y5
+ VBROADCASTI128 48(AX), Y6
+ VBROADCASTI128 112(AX), Y7
+ MOVQ x_len+8(FP), AX
+ MOVQ x_base+0(FP), CX
+ MOVQ y_base+24(FP), DX
+ MOVQ $0x0000000f, BX
+ MOVQ BX, X8
+ VPBROADCASTB X8, Y8
+
+loop:
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y12
+ VPXOR Y11, Y9, Y11
+ VPXOR Y12, Y10, Y12
+ VMOVDQU Y11, (DX)
+ VMOVDQU Y12, 32(DX)
+ VPSRLQ $0x04, Y11, Y13
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y13, Y13
+ VPSHUFB Y11, Y0, Y14
+ VPSHUFB Y11, Y1, Y11
+ VPSHUFB Y13, Y2, Y15
+ VPSHUFB Y13, Y3, Y13
+ VPXOR Y14, Y15, Y14
+ VPXOR Y11, Y13, Y11
+ VPAND Y12, Y8, Y13
+ VPSRLQ $0x04, Y12, Y12
+ VPAND Y8, Y12, Y12
+ VPSHUFB Y13, Y4, Y15
+ VPSHUFB Y13, Y5, Y13
+ VPXOR Y14, Y15, Y14
+ VPXOR Y11, Y13, Y11
+ VPSHUFB Y12, Y6, Y15
+ VPSHUFB Y12, Y7, Y13
+ XOR3WAY( $0x00, Y14, Y15, Y9)
+ XOR3WAY( $0x00, Y11, Y13, Y10)
+ VMOVDQU Y9, (CX)
+ VMOVDQU Y10, 32(CX)
+ ADDQ $0x40, CX
+ ADDQ $0x40, DX
+ SUBQ $0x40, AX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func fftDIT2_avx2(x []byte, y []byte, table *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT2_avx2(SB), NOSPLIT, $0-56
+ MOVQ table+48(FP), AX
+ VBROADCASTI128 (AX), Y0
+ VBROADCASTI128 64(AX), Y1
+ VBROADCASTI128 16(AX), Y2
+ VBROADCASTI128 80(AX), Y3
+ VBROADCASTI128 32(AX), Y4
+ VBROADCASTI128 96(AX), Y5
+ VBROADCASTI128 48(AX), Y6
+ VBROADCASTI128 112(AX), Y7
+ MOVQ x_len+8(FP), AX
+ MOVQ x_base+0(FP), CX
+ MOVQ y_base+24(FP), DX
+ MOVQ $0x0000000f, BX
+ MOVQ BX, X8
+ VPBROADCASTB X8, Y8
+
+loop:
+ VMOVDQU (CX), Y9
+ VMOVDQU 32(CX), Y10
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y12
+ VPSRLQ $0x04, Y11, Y13
+ VPAND Y8, Y11, Y11
+ VPAND Y8, Y13, Y13
+ VPSHUFB Y11, Y0, Y14
+ VPSHUFB Y11, Y1, Y11
+ VPSHUFB Y13, Y2, Y15
+ VPSHUFB Y13, Y3, Y13
+ VPXOR Y14, Y15, Y14
+ VPXOR Y11, Y13, Y11
+ VPAND Y12, Y8, Y13
+ VPSRLQ $0x04, Y12, Y12
+ VPAND Y8, Y12, Y12
+ VPSHUFB Y13, Y4, Y15
+ VPSHUFB Y13, Y5, Y13
+ VPXOR Y14, Y15, Y14
+ VPXOR Y11, Y13, Y11
+ VPSHUFB Y12, Y6, Y15
+ VPSHUFB Y12, Y7, Y13
+ XOR3WAY( $0x00, Y14, Y15, Y9)
+ XOR3WAY( $0x00, Y11, Y13, Y10)
+ VMOVDQU Y9, (CX)
+ VMOVDQU Y10, 32(CX)
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y12
+ VPXOR Y11, Y9, Y11
+ VPXOR Y12, Y10, Y12
+ VMOVDQU Y11, (DX)
+ VMOVDQU Y12, 32(DX)
+ ADDQ $0x40, CX
+ ADDQ $0x40, DX
+ SUBQ $0x40, AX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func mulgf16_avx2(x []byte, y []byte, table *[128]uint8)
+// Requires: AVX, AVX2, SSE2
+TEXT ·mulgf16_avx2(SB), NOSPLIT, $0-56
+ MOVQ table+48(FP), AX
+ VBROADCASTI128 (AX), Y0
+ VBROADCASTI128 64(AX), Y1
+ VBROADCASTI128 16(AX), Y2
+ VBROADCASTI128 80(AX), Y3
+ VBROADCASTI128 32(AX), Y4
+ VBROADCASTI128 96(AX), Y5
+ VBROADCASTI128 48(AX), Y6
+ VBROADCASTI128 112(AX), Y7
+ MOVQ x_len+8(FP), AX
+ MOVQ x_base+0(FP), CX
+ MOVQ y_base+24(FP), DX
+ MOVQ $0x0000000f, BX
+ MOVQ BX, X8
+ VPBROADCASTB X8, Y8
+
+loop:
+ VMOVDQU (DX), Y9
+ VMOVDQU 32(DX), Y10
+ VPSRLQ $0x04, Y9, Y11
+ VPAND Y8, Y9, Y9
+ VPAND Y8, Y11, Y11
+ VPSHUFB Y9, Y0, Y12
+ VPSHUFB Y9, Y1, Y9
+ VPSHUFB Y11, Y2, Y13
+ VPSHUFB Y11, Y3, Y11
+ VPXOR Y12, Y13, Y12
+ VPXOR Y9, Y11, Y9
+ VPAND Y10, Y8, Y11
+ VPSRLQ $0x04, Y10, Y10
+ VPAND Y8, Y10, Y10
+ VPSHUFB Y11, Y4, Y13
+ VPSHUFB Y11, Y5, Y11
+ VPXOR Y12, Y13, Y12
+ VPXOR Y9, Y11, Y9
+ VPSHUFB Y10, Y6, Y13
+ VPSHUFB Y10, Y7, Y11
+ VPXOR Y12, Y13, Y12
+ VPXOR Y9, Y11, Y9
+ VMOVDQU Y12, (CX)
+ VMOVDQU Y9, 32(CX)
+ ADDQ $0x40, CX
+ ADDQ $0x40, DX
+ SUBQ $0x40, AX
+ JNZ loop
+ VZEROUPPER
+ RET
+
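+// The ifftDIT4/fftDIT4 AVX512 routines below are 4-point butterflies over four
+// work buffers spaced dist slice headers apart (hence the "multiplied by 24"
+// note). Up to two of the three 128-byte tables are broadcast into Z16-Z31 up
+// front; any table still needed is re-broadcast inside the loop. Partial products
+// are folded with VPTERNLOGD $0x96, a three-way XOR. In the numbered variants
+// some table arguments are read and immediately overwritten, and the matching
+// multiply layer is reduced to plain XORs of the affected buffers.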
+// func ifftDIT4_avx512_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT4_avx512_0(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), CX
+ MOVQ table02+48(FP), DX
+ VBROADCASTI128 (DX), Y1
+ VBROADCASTI128 64(DX), Y0
+ VMOVAPS Z1, Z16
+ VMOVAPS Z0, Z17
+ VBROADCASTI128 16(DX), Y1
+ VBROADCASTI128 80(DX), Y0
+ VMOVAPS Z1, Z18
+ VMOVAPS Z0, Z19
+ VBROADCASTI128 32(DX), Y1
+ VBROADCASTI128 96(DX), Y0
+ VMOVAPS Z1, Z20
+ VMOVAPS Z0, Z21
+ VBROADCASTI128 48(DX), Y1
+ VBROADCASTI128 112(DX), Y0
+ VMOVAPS Z1, Z22
+ VMOVAPS Z0, Z23
+ VBROADCASTI128 (AX), Y1
+ VBROADCASTI128 64(AX), Y0
+ VMOVAPS Z1, Z24
+ VMOVAPS Z0, Z25
+ VBROADCASTI128 16(AX), Y1
+ VBROADCASTI128 80(AX), Y0
+ VMOVAPS Z1, Z26
+ VMOVAPS Z0, Z27
+ VBROADCASTI128 32(AX), Y1
+ VBROADCASTI128 96(AX), Y0
+ VMOVAPS Z1, Z28
+ VMOVAPS Z0, Z29
+ VBROADCASTI128 48(AX), Y1
+ VBROADCASTI128 112(AX), Y0
+ VMOVAPS Z1, Z30
+ VMOVAPS Z0, Z31
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), DX
+ MOVQ 8(DX), BX
+ XORQ SI, SI
+ MOVQ (DX)(SI*1), DI
+ ADDQ AX, SI
+ MOVQ (DX)(SI*1), R8
+ ADDQ AX, SI
+ MOVQ (DX)(SI*1), R9
+ ADDQ AX, SI
+ MOVQ (DX)(SI*1), AX
+
+loop:
+ VMOVDQU (DI), Y1
+ VMOVDQU 32(DI), Y2
+ VMOVDQU (R8), Y3
+ VMOVDQU 32(R8), Y4
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VPSRLQ $0x04, Y3, Y6
+ VPAND Y0, Y3, Y5
+ VPAND Y0, Y6, Y6
+ VPSHUFB Y5, Y24, Y7
+ VPSHUFB Y5, Y25, Y5
+ VPSHUFB Y6, Y26, Y8
+ VPSHUFB Y6, Y27, Y6
+ VPXOR Y7, Y8, Y7
+ VPXOR Y5, Y6, Y5
+ VPAND Y4, Y0, Y6
+ VPSRLQ $0x04, Y4, Y8
+ VPAND Y0, Y8, Y8
+ VPSHUFB Y6, Y28, Y9
+ VPSHUFB Y6, Y29, Y6
+ VPXOR Y7, Y9, Y7
+ VPXOR Y5, Y6, Y5
+ VPSHUFB Y8, Y30, Y9
+ VPSHUFB Y8, Y31, Y6
+ VPTERNLOGD $0x96, Y7, Y9, Y1
+ VPTERNLOGD $0x96, Y5, Y6, Y2
+ VMOVDQU (R9), Y5
+ VMOVDQU 32(R9), Y6
+ VMOVDQU (AX), Y7
+ VMOVDQU 32(AX), Y8
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (CX), Y11
+ VBROADCASTI128 64(CX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(CX), Y12
+ VBROADCASTI128 80(CX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(CX), Y13
+ VBROADCASTI128 96(CX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(CX), Y13
+ VBROADCASTI128 112(CX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y5
+ VPTERNLOGD $0x96, Y9, Y10, Y6
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPSRLQ $0x04, Y5, Y10
+ VPAND Y0, Y5, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y6, Y0, Y10
+ VPSRLQ $0x04, Y6, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y1
+ VPTERNLOGD $0x96, Y9, Y10, Y2
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y3
+ VPTERNLOGD $0x96, Y9, Y10, Y4
+ VMOVDQU Y1, (DI)
+ VMOVDQU Y2, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y3, (R8)
+ VMOVDQU Y4, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y5, (R9)
+ VMOVDQU Y6, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y7, (AX)
+ VMOVDQU Y8, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, BX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func fftDIT4_avx512_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT4_avx512_0(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), CX
+ MOVQ table02+48(FP), DX
+ VBROADCASTI128 (DX), Y1
+ VBROADCASTI128 64(DX), Y0
+ VMOVAPS Z1, Z16
+ VMOVAPS Z0, Z17
+ VBROADCASTI128 16(DX), Y1
+ VBROADCASTI128 80(DX), Y0
+ VMOVAPS Z1, Z18
+ VMOVAPS Z0, Z19
+ VBROADCASTI128 32(DX), Y1
+ VBROADCASTI128 96(DX), Y0
+ VMOVAPS Z1, Z20
+ VMOVAPS Z0, Z21
+ VBROADCASTI128 48(DX), Y1
+ VBROADCASTI128 112(DX), Y0
+ VMOVAPS Z1, Z22
+ VMOVAPS Z0, Z23
+ VBROADCASTI128 (AX), Y1
+ VBROADCASTI128 64(AX), Y0
+ VMOVAPS Z1, Z24
+ VMOVAPS Z0, Z25
+ VBROADCASTI128 16(AX), Y1
+ VBROADCASTI128 80(AX), Y0
+ VMOVAPS Z1, Z26
+ VMOVAPS Z0, Z27
+ VBROADCASTI128 32(AX), Y1
+ VBROADCASTI128 96(AX), Y0
+ VMOVAPS Z1, Z28
+ VMOVAPS Z0, Z29
+ VBROADCASTI128 48(AX), Y1
+ VBROADCASTI128 112(AX), Y0
+ VMOVAPS Z1, Z30
+ VMOVAPS Z0, Z31
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), DX
+ MOVQ 8(DX), BX
+ XORQ SI, SI
+ MOVQ (DX)(SI*1), DI
+ ADDQ AX, SI
+ MOVQ (DX)(SI*1), R8
+ ADDQ AX, SI
+ MOVQ (DX)(SI*1), R9
+ ADDQ AX, SI
+ MOVQ (DX)(SI*1), AX
+
+loop:
+ VMOVDQU (DI), Y1
+ VMOVDQU 32(DI), Y2
+ VMOVDQU (R9), Y5
+ VMOVDQU 32(R9), Y6
+ VMOVDQU (R8), Y3
+ VMOVDQU 32(R8), Y4
+ VMOVDQU (AX), Y7
+ VMOVDQU 32(AX), Y8
+ VPSRLQ $0x04, Y5, Y10
+ VPAND Y0, Y5, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y6, Y0, Y10
+ VPSRLQ $0x04, Y6, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y1
+ VPTERNLOGD $0x96, Y9, Y10, Y2
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y3
+ VPTERNLOGD $0x96, Y9, Y10, Y4
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPSRLQ $0x04, Y3, Y10
+ VPAND Y0, Y3, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y24, Y11
+ VPSHUFB Y9, Y25, Y9
+ VPSHUFB Y10, Y26, Y12
+ VPSHUFB Y10, Y27, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y4, Y0, Y10
+ VPSRLQ $0x04, Y4, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y28, Y13
+ VPSHUFB Y10, Y29, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y30, Y13
+ VPSHUFB Y12, Y31, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y1
+ VPTERNLOGD $0x96, Y9, Y10, Y2
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU Y1, (DI)
+ VMOVDQU Y2, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y3, (R8)
+ VMOVDQU Y4, 32(R8)
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y7, Y2
+ VPAND Y0, Y7, Y1
+ VPAND Y0, Y2, Y2
+ VBROADCASTI128 (CX), Y3
+ VBROADCASTI128 64(CX), Y4
+ VPSHUFB Y1, Y3, Y3
+ VPSHUFB Y1, Y4, Y1
+ VBROADCASTI128 16(CX), Y4
+ VBROADCASTI128 80(CX), Y9
+ VPSHUFB Y2, Y4, Y4
+ VPSHUFB Y2, Y9, Y2
+ VPXOR Y3, Y4, Y3
+ VPXOR Y1, Y2, Y1
+ VPAND Y8, Y0, Y2
+ VPSRLQ $0x04, Y8, Y4
+ VPAND Y0, Y4, Y4
+ VBROADCASTI128 32(CX), Y9
+ VBROADCASTI128 96(CX), Y10
+ VPSHUFB Y2, Y9, Y9
+ VPSHUFB Y2, Y10, Y2
+ VPXOR Y3, Y9, Y3
+ VPXOR Y1, Y2, Y1
+ VBROADCASTI128 48(CX), Y9
+ VBROADCASTI128 112(CX), Y2
+ VPSHUFB Y4, Y9, Y9
+ VPSHUFB Y4, Y2, Y2
+ VPTERNLOGD $0x96, Y3, Y9, Y5
+ VPTERNLOGD $0x96, Y1, Y2, Y6
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VMOVDQU Y5, (R9)
+ VMOVDQU Y6, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y7, (AX)
+ VMOVDQU Y8, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, BX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT4_avx512_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT4_avx512_1(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), AX
+ MOVQ table02+48(FP), CX
+ VBROADCASTI128 (CX), Y1
+ VBROADCASTI128 64(CX), Y0
+ VMOVAPS Z1, Z16
+ VMOVAPS Z0, Z17
+ VBROADCASTI128 16(CX), Y1
+ VBROADCASTI128 80(CX), Y0
+ VMOVAPS Z1, Z18
+ VMOVAPS Z0, Z19
+ VBROADCASTI128 32(CX), Y1
+ VBROADCASTI128 96(CX), Y0
+ VMOVAPS Z1, Z20
+ VMOVAPS Z0, Z21
+ VBROADCASTI128 48(CX), Y1
+ VBROADCASTI128 112(CX), Y0
+ VMOVAPS Z1, Z22
+ VMOVAPS Z0, Z23
+ VBROADCASTI128 (AX), Y1
+ VBROADCASTI128 64(AX), Y0
+ VMOVAPS Z1, Z24
+ VMOVAPS Z0, Z25
+ VBROADCASTI128 16(AX), Y1
+ VBROADCASTI128 80(AX), Y0
+ VMOVAPS Z1, Z26
+ VMOVAPS Z0, Z27
+ VBROADCASTI128 32(AX), Y1
+ VBROADCASTI128 96(AX), Y0
+ VMOVAPS Z1, Z28
+ VMOVAPS Z0, Z29
+ VBROADCASTI128 48(AX), Y1
+ VBROADCASTI128 112(AX), Y0
+ VMOVAPS Z1, Z30
+ VMOVAPS Z0, Z31
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU (SI), Y1
+ VMOVDQU 32(SI), Y2
+ VMOVDQU (DI), Y3
+ VMOVDQU 32(DI), Y4
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU (R8), Y5
+ VMOVDQU 32(R8), Y6
+ VMOVDQU (AX), Y7
+ VMOVDQU 32(AX), Y8
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y24, Y11
+ VPSHUFB Y9, Y25, Y9
+ VPSHUFB Y10, Y26, Y12
+ VPSHUFB Y10, Y27, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y28, Y13
+ VPSHUFB Y10, Y29, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y30, Y13
+ VPSHUFB Y12, Y31, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y5
+ VPTERNLOGD $0x96, Y9, Y10, Y6
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPSRLQ $0x04, Y5, Y10
+ VPAND Y0, Y5, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y6, Y0, Y10
+ VPSRLQ $0x04, Y6, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y1
+ VPTERNLOGD $0x96, Y9, Y10, Y2
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y3
+ VPTERNLOGD $0x96, Y9, Y10, Y4
+ VMOVDQU Y1, (SI)
+ VMOVDQU Y2, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y4, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y5, (R8)
+ VMOVDQU Y6, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y7, (AX)
+ VMOVDQU Y8, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func fftDIT4_avx512_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT4_avx512_1(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), CX
+ MOVQ table02+48(FP), DX
+ VBROADCASTI128 (AX), Y1
+ VBROADCASTI128 64(AX), Y0
+ VMOVAPS Z1, Z16
+ VMOVAPS Z0, Z17
+ VBROADCASTI128 16(AX), Y1
+ VBROADCASTI128 80(AX), Y0
+ VMOVAPS Z1, Z18
+ VMOVAPS Z0, Z19
+ VBROADCASTI128 32(AX), Y1
+ VBROADCASTI128 96(AX), Y0
+ VMOVAPS Z1, Z20
+ VMOVAPS Z0, Z21
+ VBROADCASTI128 48(AX), Y1
+ VBROADCASTI128 112(AX), Y0
+ VMOVAPS Z1, Z22
+ VMOVAPS Z0, Z23
+ VBROADCASTI128 (CX), Y1
+ VBROADCASTI128 64(CX), Y0
+ VMOVAPS Z1, Z24
+ VMOVAPS Z0, Z25
+ VBROADCASTI128 16(CX), Y1
+ VBROADCASTI128 80(CX), Y0
+ VMOVAPS Z1, Z26
+ VMOVAPS Z0, Z27
+ VBROADCASTI128 32(CX), Y1
+ VBROADCASTI128 96(CX), Y0
+ VMOVAPS Z1, Z28
+ VMOVAPS Z0, Z29
+ VBROADCASTI128 48(CX), Y1
+ VBROADCASTI128 112(CX), Y0
+ VMOVAPS Z1, Z30
+ VMOVAPS Z0, Z31
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU (SI), Y1
+ VMOVDQU 32(SI), Y2
+ VMOVDQU (R8), Y5
+ VMOVDQU 32(R8), Y6
+ VMOVDQU (DI), Y3
+ VMOVDQU 32(DI), Y4
+ VMOVDQU (AX), Y7
+ VMOVDQU 32(AX), Y8
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPSRLQ $0x04, Y3, Y10
+ VPAND Y0, Y3, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y4, Y0, Y10
+ VPSRLQ $0x04, Y4, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y1
+ VPTERNLOGD $0x96, Y9, Y10, Y2
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU Y1, (SI)
+ VMOVDQU Y2, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y4, 32(DI)
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y7, Y2
+ VPAND Y0, Y7, Y1
+ VPAND Y0, Y2, Y2
+ VPSHUFB Y1, Y24, Y3
+ VPSHUFB Y1, Y25, Y1
+ VPSHUFB Y2, Y26, Y4
+ VPSHUFB Y2, Y27, Y2
+ VPXOR Y3, Y4, Y3
+ VPXOR Y1, Y2, Y1
+ VPAND Y8, Y0, Y2
+ VPSRLQ $0x04, Y8, Y4
+ VPAND Y0, Y4, Y4
+ VPSHUFB Y2, Y28, Y9
+ VPSHUFB Y2, Y29, Y2
+ VPXOR Y3, Y9, Y3
+ VPXOR Y1, Y2, Y1
+ VPSHUFB Y4, Y30, Y9
+ VPSHUFB Y4, Y31, Y2
+ VPTERNLOGD $0x96, Y3, Y9, Y5
+ VPTERNLOGD $0x96, Y1, Y2, Y6
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VMOVDQU Y5, (R8)
+ VMOVDQU Y6, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y7, (AX)
+ VMOVDQU Y8, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT4_avx512_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT4_avx512_2(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), CX
+ MOVQ table02+48(FP), CX
+ VBROADCASTI128 (CX), Y1
+ VBROADCASTI128 64(CX), Y0
+ VMOVAPS Z1, Z16
+ VMOVAPS Z0, Z17
+ VBROADCASTI128 16(CX), Y1
+ VBROADCASTI128 80(CX), Y0
+ VMOVAPS Z1, Z18
+ VMOVAPS Z0, Z19
+ VBROADCASTI128 32(CX), Y1
+ VBROADCASTI128 96(CX), Y0
+ VMOVAPS Z1, Z20
+ VMOVAPS Z0, Z21
+ VBROADCASTI128 48(CX), Y1
+ VBROADCASTI128 112(CX), Y0
+ VMOVAPS Z1, Z22
+ VMOVAPS Z0, Z23
+ VBROADCASTI128 (AX), Y1
+ VBROADCASTI128 64(AX), Y0
+ VMOVAPS Z1, Z24
+ VMOVAPS Z0, Z25
+ VBROADCASTI128 16(AX), Y1
+ VBROADCASTI128 80(AX), Y0
+ VMOVAPS Z1, Z26
+ VMOVAPS Z0, Z27
+ VBROADCASTI128 32(AX), Y1
+ VBROADCASTI128 96(AX), Y0
+ VMOVAPS Z1, Z28
+ VMOVAPS Z0, Z29
+ VBROADCASTI128 48(AX), Y1
+ VBROADCASTI128 112(AX), Y0
+ VMOVAPS Z1, Z30
+ VMOVAPS Z0, Z31
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU (SI), Y1
+ VMOVDQU 32(SI), Y2
+ VMOVDQU (DI), Y3
+ VMOVDQU 32(DI), Y4
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VPSRLQ $0x04, Y3, Y6
+ VPAND Y0, Y3, Y5
+ VPAND Y0, Y6, Y6
+ VPSHUFB Y5, Y24, Y7
+ VPSHUFB Y5, Y25, Y5
+ VPSHUFB Y6, Y26, Y8
+ VPSHUFB Y6, Y27, Y6
+ VPXOR Y7, Y8, Y7
+ VPXOR Y5, Y6, Y5
+ VPAND Y4, Y0, Y6
+ VPSRLQ $0x04, Y4, Y8
+ VPAND Y0, Y8, Y8
+ VPSHUFB Y6, Y28, Y9
+ VPSHUFB Y6, Y29, Y6
+ VPXOR Y7, Y9, Y7
+ VPXOR Y5, Y6, Y5
+ VPSHUFB Y8, Y30, Y9
+ VPSHUFB Y8, Y31, Y6
+ VPTERNLOGD $0x96, Y7, Y9, Y1
+ VPTERNLOGD $0x96, Y5, Y6, Y2
+ VMOVDQU (R8), Y5
+ VMOVDQU 32(R8), Y6
+ VMOVDQU (AX), Y7
+ VMOVDQU 32(AX), Y8
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPSRLQ $0x04, Y5, Y10
+ VPAND Y0, Y5, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y6, Y0, Y10
+ VPSRLQ $0x04, Y6, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y1
+ VPTERNLOGD $0x96, Y9, Y10, Y2
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y3
+ VPTERNLOGD $0x96, Y9, Y10, Y4
+ VMOVDQU Y1, (SI)
+ VMOVDQU Y2, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y4, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y5, (R8)
+ VMOVDQU Y6, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y7, (AX)
+ VMOVDQU Y8, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func fftDIT4_avx512_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT4_avx512_2(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), AX
+ MOVQ table02+48(FP), CX
+ VBROADCASTI128 (CX), Y1
+ VBROADCASTI128 64(CX), Y0
+ VMOVAPS Z1, Z16
+ VMOVAPS Z0, Z17
+ VBROADCASTI128 16(CX), Y1
+ VBROADCASTI128 80(CX), Y0
+ VMOVAPS Z1, Z18
+ VMOVAPS Z0, Z19
+ VBROADCASTI128 32(CX), Y1
+ VBROADCASTI128 96(CX), Y0
+ VMOVAPS Z1, Z20
+ VMOVAPS Z0, Z21
+ VBROADCASTI128 48(CX), Y1
+ VBROADCASTI128 112(CX), Y0
+ VMOVAPS Z1, Z22
+ VMOVAPS Z0, Z23
+ VBROADCASTI128 (AX), Y1
+ VBROADCASTI128 64(AX), Y0
+ VMOVAPS Z1, Z24
+ VMOVAPS Z0, Z25
+ VBROADCASTI128 16(AX), Y1
+ VBROADCASTI128 80(AX), Y0
+ VMOVAPS Z1, Z26
+ VMOVAPS Z0, Z27
+ VBROADCASTI128 32(AX), Y1
+ VBROADCASTI128 96(AX), Y0
+ VMOVAPS Z1, Z28
+ VMOVAPS Z0, Z29
+ VBROADCASTI128 48(AX), Y1
+ VBROADCASTI128 112(AX), Y0
+ VMOVAPS Z1, Z30
+ VMOVAPS Z0, Z31
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU (SI), Y1
+ VMOVDQU 32(SI), Y2
+ VMOVDQU (R8), Y5
+ VMOVDQU 32(R8), Y6
+ VMOVDQU (DI), Y3
+ VMOVDQU 32(DI), Y4
+ VMOVDQU (AX), Y7
+ VMOVDQU 32(AX), Y8
+ VPSRLQ $0x04, Y5, Y10
+ VPAND Y0, Y5, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y6, Y0, Y10
+ VPSRLQ $0x04, Y6, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y1
+ VPTERNLOGD $0x96, Y9, Y10, Y2
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y3
+ VPTERNLOGD $0x96, Y9, Y10, Y4
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU Y1, (SI)
+ VMOVDQU Y2, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y4, 32(DI)
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y7, Y2
+ VPAND Y0, Y7, Y1
+ VPAND Y0, Y2, Y2
+ VPSHUFB Y1, Y24, Y3
+ VPSHUFB Y1, Y25, Y1
+ VPSHUFB Y2, Y26, Y4
+ VPSHUFB Y2, Y27, Y2
+ VPXOR Y3, Y4, Y3
+ VPXOR Y1, Y2, Y1
+ VPAND Y8, Y0, Y2
+ VPSRLQ $0x04, Y8, Y4
+ VPAND Y0, Y4, Y4
+ VPSHUFB Y2, Y28, Y9
+ VPSHUFB Y2, Y29, Y2
+ VPXOR Y3, Y9, Y3
+ VPXOR Y1, Y2, Y1
+ VPSHUFB Y4, Y30, Y9
+ VPSHUFB Y4, Y31, Y2
+ VPTERNLOGD $0x96, Y3, Y9, Y5
+ VPTERNLOGD $0x96, Y1, Y2, Y6
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VMOVDQU Y5, (R8)
+ VMOVDQU Y6, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y7, (AX)
+ VMOVDQU Y8, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JNZ loop
+ VZEROUPPER
+ RET
+
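+// ifftDIT4_avx512_3 keeps only the table02 multiplication (expanded into
+// Z16-Z23); the table01 and table23 stages reduce to plain XORs.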
+// func ifftDIT4_avx512_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT4_avx512_3(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), AX
+ MOVQ table02+48(FP), AX
+ VBROADCASTI128 (AX), Y1
+ VBROADCASTI128 64(AX), Y0
+ VMOVAPS Z1, Z16
+ VMOVAPS Z0, Z17
+ VBROADCASTI128 16(AX), Y1
+ VBROADCASTI128 80(AX), Y0
+ VMOVAPS Z1, Z18
+ VMOVAPS Z0, Z19
+ VBROADCASTI128 32(AX), Y1
+ VBROADCASTI128 96(AX), Y0
+ VMOVAPS Z1, Z20
+ VMOVAPS Z0, Z21
+ VBROADCASTI128 48(AX), Y1
+ VBROADCASTI128 112(AX), Y0
+ VMOVAPS Z1, Z22
+ VMOVAPS Z0, Z23
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU (SI), Y1
+ VMOVDQU 32(SI), Y2
+ VMOVDQU (DI), Y3
+ VMOVDQU 32(DI), Y4
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU (R8), Y5
+ VMOVDQU 32(R8), Y6
+ VMOVDQU (AX), Y7
+ VMOVDQU 32(AX), Y8
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPSRLQ $0x04, Y5, Y10
+ VPAND Y0, Y5, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y6, Y0, Y10
+ VPSRLQ $0x04, Y6, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y1
+ VPTERNLOGD $0x96, Y9, Y10, Y2
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y3
+ VPTERNLOGD $0x96, Y9, Y10, Y4
+ VMOVDQU Y1, (SI)
+ VMOVDQU Y2, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y4, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y5, (R8)
+ VMOVDQU Y6, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y7, (AX)
+ VMOVDQU Y8, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JNZ loop
+ VZEROUPPER
+ RET
+
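+// fftDIT4_avx512_3 keeps only the table23 multiplication (Z16-Z23), applied
+// to the second shard pair; the table02 and table01 stages are plain XORs.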
+// func fftDIT4_avx512_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT4_avx512_3(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), AX
+ MOVQ table02+48(FP), CX
+ VBROADCASTI128 (AX), Y1
+ VBROADCASTI128 64(AX), Y0
+ VMOVAPS Z1, Z16
+ VMOVAPS Z0, Z17
+ VBROADCASTI128 16(AX), Y1
+ VBROADCASTI128 80(AX), Y0
+ VMOVAPS Z1, Z18
+ VMOVAPS Z0, Z19
+ VBROADCASTI128 32(AX), Y1
+ VBROADCASTI128 96(AX), Y0
+ VMOVAPS Z1, Z20
+ VMOVAPS Z0, Z21
+ VBROADCASTI128 48(AX), Y1
+ VBROADCASTI128 112(AX), Y0
+ VMOVAPS Z1, Z22
+ VMOVAPS Z0, Z23
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU (SI), Y1
+ VMOVDQU 32(SI), Y2
+ VMOVDQU (R8), Y5
+ VMOVDQU 32(R8), Y6
+ VMOVDQU (DI), Y3
+ VMOVDQU 32(DI), Y4
+ VMOVDQU (AX), Y7
+ VMOVDQU 32(AX), Y8
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU Y1, (SI)
+ VMOVDQU Y2, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y4, 32(DI)
+ ADDQ $0x40, DI
+ VPSRLQ $0x04, Y7, Y2
+ VPAND Y0, Y7, Y1
+ VPAND Y0, Y2, Y2
+ VPSHUFB Y1, Y16, Y3
+ VPSHUFB Y1, Y17, Y1
+ VPSHUFB Y2, Y18, Y4
+ VPSHUFB Y2, Y19, Y2
+ VPXOR Y3, Y4, Y3
+ VPXOR Y1, Y2, Y1
+ VPAND Y8, Y0, Y2
+ VPSRLQ $0x04, Y8, Y4
+ VPAND Y0, Y4, Y4
+ VPSHUFB Y2, Y20, Y9
+ VPSHUFB Y2, Y21, Y2
+ VPXOR Y3, Y9, Y3
+ VPXOR Y1, Y2, Y1
+ VPSHUFB Y4, Y22, Y9
+ VPSHUFB Y4, Y23, Y2
+ VPTERNLOGD $0x96, Y3, Y9, Y5
+ VPTERNLOGD $0x96, Y1, Y2, Y6
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VMOVDQU Y5, (R8)
+ VMOVDQU Y6, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y7, (AX)
+ VMOVDQU Y8, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JNZ loop
+ VZEROUPPER
+ RET
+
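+// ifftDIT4_avx512_4 performs the table01 (Z16-Z23) and table23 (Z24-Z31)
+// multiplications and skips the final table02 stage, finishing with plain
+// XORs.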
+// func ifftDIT4_avx512_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT4_avx512_4(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), CX
+ MOVQ table02+48(FP), DX
+ VBROADCASTI128 (AX), Y1
+ VBROADCASTI128 64(AX), Y0
+ VMOVAPS Z1, Z16
+ VMOVAPS Z0, Z17
+ VBROADCASTI128 16(AX), Y1
+ VBROADCASTI128 80(AX), Y0
+ VMOVAPS Z1, Z18
+ VMOVAPS Z0, Z19
+ VBROADCASTI128 32(AX), Y1
+ VBROADCASTI128 96(AX), Y0
+ VMOVAPS Z1, Z20
+ VMOVAPS Z0, Z21
+ VBROADCASTI128 48(AX), Y1
+ VBROADCASTI128 112(AX), Y0
+ VMOVAPS Z1, Z22
+ VMOVAPS Z0, Z23
+ VBROADCASTI128 (CX), Y1
+ VBROADCASTI128 64(CX), Y0
+ VMOVAPS Z1, Z24
+ VMOVAPS Z0, Z25
+ VBROADCASTI128 16(CX), Y1
+ VBROADCASTI128 80(CX), Y0
+ VMOVAPS Z1, Z26
+ VMOVAPS Z0, Z27
+ VBROADCASTI128 32(CX), Y1
+ VBROADCASTI128 96(CX), Y0
+ VMOVAPS Z1, Z28
+ VMOVAPS Z0, Z29
+ VBROADCASTI128 48(CX), Y1
+ VBROADCASTI128 112(CX), Y0
+ VMOVAPS Z1, Z30
+ VMOVAPS Z0, Z31
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU (SI), Y1
+ VMOVDQU 32(SI), Y2
+ VMOVDQU (DI), Y3
+ VMOVDQU 32(DI), Y4
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VPSRLQ $0x04, Y3, Y6
+ VPAND Y0, Y3, Y5
+ VPAND Y0, Y6, Y6
+ VPSHUFB Y5, Y16, Y7
+ VPSHUFB Y5, Y17, Y5
+ VPSHUFB Y6, Y18, Y8
+ VPSHUFB Y6, Y19, Y6
+ VPXOR Y7, Y8, Y7
+ VPXOR Y5, Y6, Y5
+ VPAND Y4, Y0, Y6
+ VPSRLQ $0x04, Y4, Y8
+ VPAND Y0, Y8, Y8
+ VPSHUFB Y6, Y20, Y9
+ VPSHUFB Y6, Y21, Y6
+ VPXOR Y7, Y9, Y7
+ VPXOR Y5, Y6, Y5
+ VPSHUFB Y8, Y22, Y9
+ VPSHUFB Y8, Y23, Y6
+ VPTERNLOGD $0x96, Y7, Y9, Y1
+ VPTERNLOGD $0x96, Y5, Y6, Y2
+ VMOVDQU (R8), Y5
+ VMOVDQU 32(R8), Y6
+ VMOVDQU (AX), Y7
+ VMOVDQU 32(AX), Y8
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y24, Y11
+ VPSHUFB Y9, Y25, Y9
+ VPSHUFB Y10, Y26, Y12
+ VPSHUFB Y10, Y27, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y28, Y13
+ VPSHUFB Y10, Y29, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y30, Y13
+ VPSHUFB Y12, Y31, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y5
+ VPTERNLOGD $0x96, Y9, Y10, Y6
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VMOVDQU Y1, (SI)
+ VMOVDQU Y2, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y4, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y5, (R8)
+ VMOVDQU Y6, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y7, (AX)
+ VMOVDQU Y8, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JNZ loop
+ VZEROUPPER
+ RET
+
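+// fftDIT4_avx512_4 performs the table02 (Z16-Z23) and table01 (Z24-Z31)
+// multiplications and skips the table23 stage on the second shard pair.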
+// func fftDIT4_avx512_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT4_avx512_4(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), CX
+ MOVQ table02+48(FP), CX
+ VBROADCASTI128 (CX), Y1
+ VBROADCASTI128 64(CX), Y0
+ VMOVAPS Z1, Z16
+ VMOVAPS Z0, Z17
+ VBROADCASTI128 16(CX), Y1
+ VBROADCASTI128 80(CX), Y0
+ VMOVAPS Z1, Z18
+ VMOVAPS Z0, Z19
+ VBROADCASTI128 32(CX), Y1
+ VBROADCASTI128 96(CX), Y0
+ VMOVAPS Z1, Z20
+ VMOVAPS Z0, Z21
+ VBROADCASTI128 48(CX), Y1
+ VBROADCASTI128 112(CX), Y0
+ VMOVAPS Z1, Z22
+ VMOVAPS Z0, Z23
+ VBROADCASTI128 (AX), Y1
+ VBROADCASTI128 64(AX), Y0
+ VMOVAPS Z1, Z24
+ VMOVAPS Z0, Z25
+ VBROADCASTI128 16(AX), Y1
+ VBROADCASTI128 80(AX), Y0
+ VMOVAPS Z1, Z26
+ VMOVAPS Z0, Z27
+ VBROADCASTI128 32(AX), Y1
+ VBROADCASTI128 96(AX), Y0
+ VMOVAPS Z1, Z28
+ VMOVAPS Z0, Z29
+ VBROADCASTI128 48(AX), Y1
+ VBROADCASTI128 112(AX), Y0
+ VMOVAPS Z1, Z30
+ VMOVAPS Z0, Z31
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU (SI), Y1
+ VMOVDQU 32(SI), Y2
+ VMOVDQU (R8), Y5
+ VMOVDQU 32(R8), Y6
+ VMOVDQU (DI), Y3
+ VMOVDQU 32(DI), Y4
+ VMOVDQU (AX), Y7
+ VMOVDQU 32(AX), Y8
+ VPSRLQ $0x04, Y5, Y10
+ VPAND Y0, Y5, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y6, Y0, Y10
+ VPSRLQ $0x04, Y6, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y1
+ VPTERNLOGD $0x96, Y9, Y10, Y2
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y3
+ VPTERNLOGD $0x96, Y9, Y10, Y4
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPSRLQ $0x04, Y3, Y10
+ VPAND Y0, Y3, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y24, Y11
+ VPSHUFB Y9, Y25, Y9
+ VPSHUFB Y10, Y26, Y12
+ VPSHUFB Y10, Y27, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y4, Y0, Y10
+ VPSRLQ $0x04, Y4, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y28, Y13
+ VPSHUFB Y10, Y29, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y30, Y13
+ VPSHUFB Y12, Y31, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y1
+ VPTERNLOGD $0x96, Y9, Y10, Y2
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU Y1, (SI)
+ VMOVDQU Y2, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y4, 32(DI)
+ ADDQ $0x40, DI
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VMOVDQU Y5, (R8)
+ VMOVDQU Y6, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y7, (AX)
+ VMOVDQU Y8, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JNZ loop
+ VZEROUPPER
+ RET
+
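+// ifftDIT4_avx512_5 keeps only the table23 multiplication (Z16-Z23) on the
+// second shard pair; the table01 and table02 stages reduce to plain XORs.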
+// func ifftDIT4_avx512_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT4_avx512_5(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), AX
+ MOVQ table02+48(FP), CX
+ VBROADCASTI128 (AX), Y1
+ VBROADCASTI128 64(AX), Y0
+ VMOVAPS Z1, Z16
+ VMOVAPS Z0, Z17
+ VBROADCASTI128 16(AX), Y1
+ VBROADCASTI128 80(AX), Y0
+ VMOVAPS Z1, Z18
+ VMOVAPS Z0, Z19
+ VBROADCASTI128 32(AX), Y1
+ VBROADCASTI128 96(AX), Y0
+ VMOVAPS Z1, Z20
+ VMOVAPS Z0, Z21
+ VBROADCASTI128 48(AX), Y1
+ VBROADCASTI128 112(AX), Y0
+ VMOVAPS Z1, Z22
+ VMOVAPS Z0, Z23
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU (SI), Y1
+ VMOVDQU 32(SI), Y2
+ VMOVDQU (DI), Y3
+ VMOVDQU 32(DI), Y4
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU (R8), Y5
+ VMOVDQU 32(R8), Y6
+ VMOVDQU (AX), Y7
+ VMOVDQU 32(AX), Y8
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y5
+ VPTERNLOGD $0x96, Y9, Y10, Y6
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VMOVDQU Y1, (SI)
+ VMOVDQU Y2, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y4, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y5, (R8)
+ VMOVDQU Y6, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y7, (AX)
+ VMOVDQU Y8, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func fftDIT4_avx512_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT4_avx512_5(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), CX
+ MOVQ table02+48(FP), CX
+ VBROADCASTI128 (AX), Y1
+ VBROADCASTI128 64(AX), Y0
+ VMOVAPS Z1, Z16
+ VMOVAPS Z0, Z17
+ VBROADCASTI128 16(AX), Y1
+ VBROADCASTI128 80(AX), Y0
+ VMOVAPS Z1, Z18
+ VMOVAPS Z0, Z19
+ VBROADCASTI128 32(AX), Y1
+ VBROADCASTI128 96(AX), Y0
+ VMOVAPS Z1, Z20
+ VMOVAPS Z0, Z21
+ VBROADCASTI128 48(AX), Y1
+ VBROADCASTI128 112(AX), Y0
+ VMOVAPS Z1, Z22
+ VMOVAPS Z0, Z23
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU (SI), Y1
+ VMOVDQU 32(SI), Y2
+ VMOVDQU (R8), Y5
+ VMOVDQU 32(R8), Y6
+ VMOVDQU (DI), Y3
+ VMOVDQU 32(DI), Y4
+ VMOVDQU (AX), Y7
+ VMOVDQU 32(AX), Y8
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPSRLQ $0x04, Y3, Y10
+ VPAND Y0, Y3, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y4, Y0, Y10
+ VPSRLQ $0x04, Y4, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y1
+ VPTERNLOGD $0x96, Y9, Y10, Y2
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU Y1, (SI)
+ VMOVDQU Y2, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y4, 32(DI)
+ ADDQ $0x40, DI
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VMOVDQU Y5, (R8)
+ VMOVDQU Y6, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y7, (AX)
+ VMOVDQU Y8, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT4_avx512_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT4_avx512_6(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), CX
+ MOVQ table02+48(FP), CX
+ VBROADCASTI128 (AX), Y1
+ VBROADCASTI128 64(AX), Y0
+ VMOVAPS Z1, Z16
+ VMOVAPS Z0, Z17
+ VBROADCASTI128 16(AX), Y1
+ VBROADCASTI128 80(AX), Y0
+ VMOVAPS Z1, Z18
+ VMOVAPS Z0, Z19
+ VBROADCASTI128 32(AX), Y1
+ VBROADCASTI128 96(AX), Y0
+ VMOVAPS Z1, Z20
+ VMOVAPS Z0, Z21
+ VBROADCASTI128 48(AX), Y1
+ VBROADCASTI128 112(AX), Y0
+ VMOVAPS Z1, Z22
+ VMOVAPS Z0, Z23
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU (SI), Y1
+ VMOVDQU 32(SI), Y2
+ VMOVDQU (DI), Y3
+ VMOVDQU 32(DI), Y4
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VPSRLQ $0x04, Y3, Y6
+ VPAND Y0, Y3, Y5
+ VPAND Y0, Y6, Y6
+ VPSHUFB Y5, Y16, Y7
+ VPSHUFB Y5, Y17, Y5
+ VPSHUFB Y6, Y18, Y8
+ VPSHUFB Y6, Y19, Y6
+ VPXOR Y7, Y8, Y7
+ VPXOR Y5, Y6, Y5
+ VPAND Y4, Y0, Y6
+ VPSRLQ $0x04, Y4, Y8
+ VPAND Y0, Y8, Y8
+ VPSHUFB Y6, Y20, Y9
+ VPSHUFB Y6, Y21, Y6
+ VPXOR Y7, Y9, Y7
+ VPXOR Y5, Y6, Y5
+ VPSHUFB Y8, Y22, Y9
+ VPSHUFB Y8, Y23, Y6
+ VPTERNLOGD $0x96, Y7, Y9, Y1
+ VPTERNLOGD $0x96, Y5, Y6, Y2
+ VMOVDQU (R8), Y5
+ VMOVDQU 32(R8), Y6
+ VMOVDQU (AX), Y7
+ VMOVDQU 32(AX), Y8
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VMOVDQU Y1, (SI)
+ VMOVDQU Y2, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y4, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y5, (R8)
+ VMOVDQU Y6, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y7, (AX)
+ VMOVDQU Y8, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func fftDIT4_avx512_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT4_avx512_6(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), AX
+ MOVQ table02+48(FP), AX
+ VBROADCASTI128 (AX), Y1
+ VBROADCASTI128 64(AX), Y0
+ VMOVAPS Z1, Z16
+ VMOVAPS Z0, Z17
+ VBROADCASTI128 16(AX), Y1
+ VBROADCASTI128 80(AX), Y0
+ VMOVAPS Z1, Z18
+ VMOVAPS Z0, Z19
+ VBROADCASTI128 32(AX), Y1
+ VBROADCASTI128 96(AX), Y0
+ VMOVAPS Z1, Z20
+ VMOVAPS Z0, Z21
+ VBROADCASTI128 48(AX), Y1
+ VBROADCASTI128 112(AX), Y0
+ VMOVAPS Z1, Z22
+ VMOVAPS Z0, Z23
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU (SI), Y1
+ VMOVDQU 32(SI), Y2
+ VMOVDQU (R8), Y5
+ VMOVDQU 32(R8), Y6
+ VMOVDQU (DI), Y3
+ VMOVDQU 32(DI), Y4
+ VMOVDQU (AX), Y7
+ VMOVDQU 32(AX), Y8
+ VPSRLQ $0x04, Y5, Y10
+ VPAND Y0, Y5, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y6, Y0, Y10
+ VPSRLQ $0x04, Y6, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y1
+ VPTERNLOGD $0x96, Y9, Y10, Y2
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VPSHUFB Y9, Y16, Y11
+ VPSHUFB Y9, Y17, Y9
+ VPSHUFB Y10, Y18, Y12
+ VPSHUFB Y10, Y19, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VPSHUFB Y10, Y20, Y13
+ VPSHUFB Y10, Y21, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VPSHUFB Y12, Y22, Y13
+ VPSHUFB Y12, Y23, Y10
+ VPTERNLOGD $0x96, Y11, Y13, Y3
+ VPTERNLOGD $0x96, Y9, Y10, Y4
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU Y1, (SI)
+ VMOVDQU Y2, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y4, 32(DI)
+ ADDQ $0x40, DI
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VMOVDQU Y5, (R8)
+ VMOVDQU Y6, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y7, (AX)
+ VMOVDQU Y8, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JNZ loop
+ VZEROUPPER
+ RET
+
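+// The _7 variants skip all three multiplications and are pure XOR
+// butterflies; the table pointer loads and the 0x0f mask setup below are
+// dead code (Y0 is reused for shard data inside the loop).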
+// func ifftDIT4_avx512_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, SSE2
+TEXT ·ifftDIT4_avx512_7(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), AX
+ MOVQ table02+48(FP), AX
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU (SI), Y0
+ VMOVDQU 32(SI), Y1
+ VMOVDQU (DI), Y2
+ VMOVDQU 32(DI), Y3
+ VPXOR Y0, Y2, Y2
+ VPXOR Y1, Y3, Y3
+ VMOVDQU (R8), Y4
+ VMOVDQU 32(R8), Y5
+ VMOVDQU (AX), Y6
+ VMOVDQU 32(AX), Y7
+ VPXOR Y4, Y6, Y6
+ VPXOR Y5, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VMOVDQU Y0, (SI)
+ VMOVDQU Y1, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y2, (DI)
+ VMOVDQU Y3, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y4, (R8)
+ VMOVDQU Y5, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y6, (AX)
+ VMOVDQU Y7, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func fftDIT4_avx512_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, SSE2
+TEXT ·fftDIT4_avx512_7(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), AX
+ MOVQ table02+48(FP), AX
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU (SI), Y0
+ VMOVDQU 32(SI), Y1
+ VMOVDQU (R8), Y4
+ VMOVDQU 32(R8), Y5
+ VMOVDQU (DI), Y2
+ VMOVDQU 32(DI), Y3
+ VMOVDQU (AX), Y6
+ VMOVDQU 32(AX), Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y0, Y2, Y2
+ VPXOR Y1, Y3, Y3
+ VMOVDQU Y0, (SI)
+ VMOVDQU Y1, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y2, (DI)
+ VMOVDQU Y3, 32(DI)
+ ADDQ $0x40, DI
+ VPXOR Y4, Y6, Y6
+ VPXOR Y5, Y7, Y7
+ VMOVDQU Y4, (R8)
+ VMOVDQU Y5, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y6, (AX)
+ VMOVDQU Y7, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JNZ loop
+ VZEROUPPER
+ RET
+
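+// The avx2 variants below implement the same butterflies but re-broadcast
+// the 16-byte sub-tables from memory with VBROADCASTI128 inside the loop
+// instead of caching them in Z16-Z31, and accumulate with the XOR3WAY
+// three-way-XOR macro.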
+// func ifftDIT4_avx2_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT4_avx2_0(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), CX
+ MOVQ table02+48(FP), DX
+ MOVQ $0x0000000f, BX
+ MOVQ BX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), BX
+ MOVQ work_base+0(FP), SI
+ MOVQ 8(SI), DI
+ XORQ R8, R8
+ MOVQ (SI)(R8*1), R9
+ ADDQ BX, R8
+ MOVQ (SI)(R8*1), R10
+ ADDQ BX, R8
+ MOVQ (SI)(R8*1), R11
+ ADDQ BX, R8
+ MOVQ (SI)(R8*1), BX
+
+loop:
+ VMOVDQU (R9), Y1
+ VMOVDQU 32(R9), Y2
+ VMOVDQU (R10), Y3
+ VMOVDQU 32(R10), Y4
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VPSRLQ $0x04, Y3, Y6
+ VPAND Y0, Y3, Y5
+ VPAND Y0, Y6, Y6
+ VBROADCASTI128 (AX), Y7
+ VBROADCASTI128 64(AX), Y8
+ VPSHUFB Y5, Y7, Y7
+ VPSHUFB Y5, Y8, Y5
+ VBROADCASTI128 16(AX), Y8
+ VBROADCASTI128 80(AX), Y9
+ VPSHUFB Y6, Y8, Y8
+ VPSHUFB Y6, Y9, Y6
+ VPXOR Y7, Y8, Y7
+ VPXOR Y5, Y6, Y5
+ VPAND Y4, Y0, Y6
+ VPSRLQ $0x04, Y4, Y8
+ VPAND Y0, Y8, Y8
+ VBROADCASTI128 32(AX), Y9
+ VBROADCASTI128 96(AX), Y10
+ VPSHUFB Y6, Y9, Y9
+ VPSHUFB Y6, Y10, Y6
+ VPXOR Y7, Y9, Y7
+ VPXOR Y5, Y6, Y5
+ VBROADCASTI128 48(AX), Y9
+ VBROADCASTI128 112(AX), Y6
+ VPSHUFB Y8, Y9, Y9
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y7, Y9, Y1)
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU (R11), Y5
+ VMOVDQU 32(R11), Y6
+ VMOVDQU (BX), Y7
+ VMOVDQU 32(BX), Y8
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (CX), Y11
+ VBROADCASTI128 64(CX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(CX), Y12
+ VBROADCASTI128 80(CX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(CX), Y13
+ VBROADCASTI128 96(CX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(CX), Y13
+ VBROADCASTI128 112(CX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y5)
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPSRLQ $0x04, Y5, Y10
+ VPAND Y0, Y5, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (DX), Y11
+ VBROADCASTI128 64(DX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(DX), Y12
+ VBROADCASTI128 80(DX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y6, Y0, Y10
+ VPSRLQ $0x04, Y6, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(DX), Y13
+ VBROADCASTI128 96(DX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(DX), Y13
+ VBROADCASTI128 112(DX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y1)
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (DX), Y11
+ VBROADCASTI128 64(DX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(DX), Y12
+ VBROADCASTI128 80(DX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(DX), Y13
+ VBROADCASTI128 96(DX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(DX), Y13
+ VBROADCASTI128 112(DX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y3)
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU Y1, (R9)
+ VMOVDQU Y2, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y3, (R10)
+ VMOVDQU Y4, 32(R10)
+ ADDQ $0x40, R10
+ VMOVDQU Y5, (R11)
+ VMOVDQU Y6, 32(R11)
+ ADDQ $0x40, R11
+ VMOVDQU Y7, (BX)
+ VMOVDQU Y8, 32(BX)
+ ADDQ $0x40, BX
+ SUBQ $0x40, DI
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func fftDIT4_avx2_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT4_avx2_0(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), CX
+ MOVQ table02+48(FP), DX
+ MOVQ $0x0000000f, BX
+ MOVQ BX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), BX
+ MOVQ work_base+0(FP), SI
+ MOVQ 8(SI), DI
+ XORQ R8, R8
+ MOVQ (SI)(R8*1), R9
+ ADDQ BX, R8
+ MOVQ (SI)(R8*1), R10
+ ADDQ BX, R8
+ MOVQ (SI)(R8*1), R11
+ ADDQ BX, R8
+ MOVQ (SI)(R8*1), BX
+
+loop:
+ VMOVDQU (R9), Y1
+ VMOVDQU 32(R9), Y2
+ VMOVDQU (R11), Y5
+ VMOVDQU 32(R11), Y6
+ VMOVDQU (R10), Y3
+ VMOVDQU 32(R10), Y4
+ VMOVDQU (BX), Y7
+ VMOVDQU 32(BX), Y8
+ VPSRLQ $0x04, Y5, Y10
+ VPAND Y0, Y5, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (DX), Y11
+ VBROADCASTI128 64(DX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(DX), Y12
+ VBROADCASTI128 80(DX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y6, Y0, Y10
+ VPSRLQ $0x04, Y6, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(DX), Y13
+ VBROADCASTI128 96(DX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(DX), Y13
+ VBROADCASTI128 112(DX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y1)
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (DX), Y11
+ VBROADCASTI128 64(DX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(DX), Y12
+ VBROADCASTI128 80(DX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(DX), Y13
+ VBROADCASTI128 96(DX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(DX), Y13
+ VBROADCASTI128 112(DX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y3)
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPSRLQ $0x04, Y3, Y10
+ VPAND Y0, Y3, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (AX), Y11
+ VBROADCASTI128 64(AX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(AX), Y12
+ VBROADCASTI128 80(AX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y4, Y0, Y10
+ VPSRLQ $0x04, Y4, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(AX), Y13
+ VBROADCASTI128 96(AX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(AX), Y13
+ VBROADCASTI128 112(AX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y1)
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU Y1, (R9)
+ VMOVDQU Y2, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y3, (R10)
+ VMOVDQU Y4, 32(R10)
+ ADDQ $0x40, R10
+ VPSRLQ $0x04, Y7, Y2
+ VPAND Y0, Y7, Y1
+ VPAND Y0, Y2, Y2
+ VBROADCASTI128 (CX), Y3
+ VBROADCASTI128 64(CX), Y4
+ VPSHUFB Y1, Y3, Y3
+ VPSHUFB Y1, Y4, Y1
+ VBROADCASTI128 16(CX), Y4
+ VBROADCASTI128 80(CX), Y9
+ VPSHUFB Y2, Y4, Y4
+ VPSHUFB Y2, Y9, Y2
+ VPXOR Y3, Y4, Y3
+ VPXOR Y1, Y2, Y1
+ VPAND Y8, Y0, Y2
+ VPSRLQ $0x04, Y8, Y4
+ VPAND Y0, Y4, Y4
+ VBROADCASTI128 32(CX), Y9
+ VBROADCASTI128 96(CX), Y10
+ VPSHUFB Y2, Y9, Y9
+ VPSHUFB Y2, Y10, Y2
+ VPXOR Y3, Y9, Y3
+ VPXOR Y1, Y2, Y1
+ VBROADCASTI128 48(CX), Y9
+ VBROADCASTI128 112(CX), Y2
+ VPSHUFB Y4, Y9, Y9
+ VPSHUFB Y4, Y2, Y2
+ XOR3WAY( $0x00, Y3, Y9, Y5)
+ XOR3WAY( $0x00, Y1, Y2, Y6)
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VMOVDQU Y5, (R11)
+ VMOVDQU Y6, 32(R11)
+ ADDQ $0x40, R11
+ VMOVDQU Y7, (BX)
+ VMOVDQU Y8, 32(BX)
+ ADDQ $0x40, BX
+ SUBQ $0x40, DI
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT4_avx2_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT4_avx2_1(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), AX
+ MOVQ table02+48(FP), CX
+ MOVQ $0x0000000f, DX
+ MOVQ DX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), DX
+ MOVQ work_base+0(FP), BX
+ MOVQ 8(BX), SI
+ XORQ DI, DI
+ MOVQ (BX)(DI*1), R8
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), R9
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), R10
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), DX
+
+loop:
+ VMOVDQU (R8), Y1
+ VMOVDQU 32(R8), Y2
+ VMOVDQU (R9), Y3
+ VMOVDQU 32(R9), Y4
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU (R10), Y5
+ VMOVDQU 32(R10), Y6
+ VMOVDQU (DX), Y7
+ VMOVDQU 32(DX), Y8
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (AX), Y11
+ VBROADCASTI128 64(AX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(AX), Y12
+ VBROADCASTI128 80(AX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(AX), Y13
+ VBROADCASTI128 96(AX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(AX), Y13
+ VBROADCASTI128 112(AX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y5)
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPSRLQ $0x04, Y5, Y10
+ VPAND Y0, Y5, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (CX), Y11
+ VBROADCASTI128 64(CX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(CX), Y12
+ VBROADCASTI128 80(CX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y6, Y0, Y10
+ VPSRLQ $0x04, Y6, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(CX), Y13
+ VBROADCASTI128 96(CX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(CX), Y13
+ VBROADCASTI128 112(CX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y1)
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (CX), Y11
+ VBROADCASTI128 64(CX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(CX), Y12
+ VBROADCASTI128 80(CX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(CX), Y13
+ VBROADCASTI128 96(CX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(CX), Y13
+ VBROADCASTI128 112(CX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y3)
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU Y1, (R8)
+ VMOVDQU Y2, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y3, (R9)
+ VMOVDQU Y4, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y5, (R10)
+ VMOVDQU Y6, 32(R10)
+ ADDQ $0x40, R10
+ VMOVDQU Y7, (DX)
+ VMOVDQU Y8, 32(DX)
+ ADDQ $0x40, DX
+ SUBQ $0x40, SI
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func fftDIT4_avx2_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT4_avx2_1(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), CX
+ MOVQ table02+48(FP), DX
+ MOVQ $0x0000000f, DX
+ MOVQ DX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), DX
+ MOVQ work_base+0(FP), BX
+ MOVQ 8(BX), SI
+ XORQ DI, DI
+ MOVQ (BX)(DI*1), R8
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), R9
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), R10
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), DX
+
+loop:
+ VMOVDQU (R8), Y1
+ VMOVDQU 32(R8), Y2
+ VMOVDQU (R10), Y5
+ VMOVDQU 32(R10), Y6
+ VMOVDQU (R9), Y3
+ VMOVDQU 32(R9), Y4
+ VMOVDQU (DX), Y7
+ VMOVDQU 32(DX), Y8
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPSRLQ $0x04, Y3, Y10
+ VPAND Y0, Y3, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (AX), Y11
+ VBROADCASTI128 64(AX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(AX), Y12
+ VBROADCASTI128 80(AX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y4, Y0, Y10
+ VPSRLQ $0x04, Y4, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(AX), Y13
+ VBROADCASTI128 96(AX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(AX), Y13
+ VBROADCASTI128 112(AX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y1)
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU Y1, (R8)
+ VMOVDQU Y2, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y3, (R9)
+ VMOVDQU Y4, 32(R9)
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y7, Y2
+ VPAND Y0, Y7, Y1
+ VPAND Y0, Y2, Y2
+ VBROADCASTI128 (CX), Y3
+ VBROADCASTI128 64(CX), Y4
+ VPSHUFB Y1, Y3, Y3
+ VPSHUFB Y1, Y4, Y1
+ VBROADCASTI128 16(CX), Y4
+ VBROADCASTI128 80(CX), Y9
+ VPSHUFB Y2, Y4, Y4
+ VPSHUFB Y2, Y9, Y2
+ VPXOR Y3, Y4, Y3
+ VPXOR Y1, Y2, Y1
+ VPAND Y8, Y0, Y2
+ VPSRLQ $0x04, Y8, Y4
+ VPAND Y0, Y4, Y4
+ VBROADCASTI128 32(CX), Y9
+ VBROADCASTI128 96(CX), Y10
+ VPSHUFB Y2, Y9, Y9
+ VPSHUFB Y2, Y10, Y2
+ VPXOR Y3, Y9, Y3
+ VPXOR Y1, Y2, Y1
+ VBROADCASTI128 48(CX), Y9
+ VBROADCASTI128 112(CX), Y2
+ VPSHUFB Y4, Y9, Y9
+ VPSHUFB Y4, Y2, Y2
+ XOR3WAY( $0x00, Y3, Y9, Y5)
+ XOR3WAY( $0x00, Y1, Y2, Y6)
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VMOVDQU Y5, (R10)
+ VMOVDQU Y6, 32(R10)
+ ADDQ $0x40, R10
+ VMOVDQU Y7, (DX)
+ VMOVDQU Y8, 32(DX)
+ ADDQ $0x40, DX
+ SUBQ $0x40, SI
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT4_avx2_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT4_avx2_2(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), CX
+ MOVQ table02+48(FP), CX
+ MOVQ $0x0000000f, DX
+ MOVQ DX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), DX
+ MOVQ work_base+0(FP), BX
+ MOVQ 8(BX), SI
+ XORQ DI, DI
+ MOVQ (BX)(DI*1), R8
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), R9
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), R10
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), DX
+
+loop:
+ VMOVDQU (R8), Y1
+ VMOVDQU 32(R8), Y2
+ VMOVDQU (R9), Y3
+ VMOVDQU 32(R9), Y4
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VPSRLQ $0x04, Y3, Y6
+ VPAND Y0, Y3, Y5
+ VPAND Y0, Y6, Y6
+ VBROADCASTI128 (AX), Y7
+ VBROADCASTI128 64(AX), Y8
+ VPSHUFB Y5, Y7, Y7
+ VPSHUFB Y5, Y8, Y5
+ VBROADCASTI128 16(AX), Y8
+ VBROADCASTI128 80(AX), Y9
+ VPSHUFB Y6, Y8, Y8
+ VPSHUFB Y6, Y9, Y6
+ VPXOR Y7, Y8, Y7
+ VPXOR Y5, Y6, Y5
+ VPAND Y4, Y0, Y6
+ VPSRLQ $0x04, Y4, Y8
+ VPAND Y0, Y8, Y8
+ VBROADCASTI128 32(AX), Y9
+ VBROADCASTI128 96(AX), Y10
+ VPSHUFB Y6, Y9, Y9
+ VPSHUFB Y6, Y10, Y6
+ VPXOR Y7, Y9, Y7
+ VPXOR Y5, Y6, Y5
+ VBROADCASTI128 48(AX), Y9
+ VBROADCASTI128 112(AX), Y6
+ VPSHUFB Y8, Y9, Y9
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y7, Y9, Y1)
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU (R10), Y5
+ VMOVDQU 32(R10), Y6
+ VMOVDQU (DX), Y7
+ VMOVDQU 32(DX), Y8
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPSRLQ $0x04, Y5, Y10
+ VPAND Y0, Y5, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (CX), Y11
+ VBROADCASTI128 64(CX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(CX), Y12
+ VBROADCASTI128 80(CX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y6, Y0, Y10
+ VPSRLQ $0x04, Y6, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(CX), Y13
+ VBROADCASTI128 96(CX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(CX), Y13
+ VBROADCASTI128 112(CX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y1)
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (CX), Y11
+ VBROADCASTI128 64(CX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(CX), Y12
+ VBROADCASTI128 80(CX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(CX), Y13
+ VBROADCASTI128 96(CX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(CX), Y13
+ VBROADCASTI128 112(CX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y3)
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU Y1, (R8)
+ VMOVDQU Y2, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y3, (R9)
+ VMOVDQU Y4, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y5, (R10)
+ VMOVDQU Y6, 32(R10)
+ ADDQ $0x40, R10
+ VMOVDQU Y7, (DX)
+ VMOVDQU Y8, 32(DX)
+ ADDQ $0x40, DX
+ SUBQ $0x40, SI
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func fftDIT4_avx2_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT4_avx2_2(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), AX
+ MOVQ table02+48(FP), CX
+ MOVQ $0x0000000f, DX
+ MOVQ DX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), DX
+ MOVQ work_base+0(FP), BX
+ MOVQ 8(BX), SI
+ XORQ DI, DI
+ MOVQ (BX)(DI*1), R8
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), R9
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), R10
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), DX
+
+loop:
+ VMOVDQU (R8), Y1
+ VMOVDQU 32(R8), Y2
+ VMOVDQU (R10), Y5
+ VMOVDQU 32(R10), Y6
+ VMOVDQU (R9), Y3
+ VMOVDQU 32(R9), Y4
+ VMOVDQU (DX), Y7
+ VMOVDQU 32(DX), Y8
+ VPSRLQ $0x04, Y5, Y10
+ VPAND Y0, Y5, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (CX), Y11
+ VBROADCASTI128 64(CX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(CX), Y12
+ VBROADCASTI128 80(CX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y6, Y0, Y10
+ VPSRLQ $0x04, Y6, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(CX), Y13
+ VBROADCASTI128 96(CX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(CX), Y13
+ VBROADCASTI128 112(CX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y1)
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (CX), Y11
+ VBROADCASTI128 64(CX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(CX), Y12
+ VBROADCASTI128 80(CX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(CX), Y13
+ VBROADCASTI128 96(CX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(CX), Y13
+ VBROADCASTI128 112(CX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y3)
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU Y1, (R8)
+ VMOVDQU Y2, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y3, (R9)
+ VMOVDQU Y4, 32(R9)
+ ADDQ $0x40, R9
+ VPSRLQ $0x04, Y7, Y2
+ VPAND Y0, Y7, Y1
+ VPAND Y0, Y2, Y2
+ VBROADCASTI128 (AX), Y3
+ VBROADCASTI128 64(AX), Y4
+ VPSHUFB Y1, Y3, Y3
+ VPSHUFB Y1, Y4, Y1
+ VBROADCASTI128 16(AX), Y4
+ VBROADCASTI128 80(AX), Y9
+ VPSHUFB Y2, Y4, Y4
+ VPSHUFB Y2, Y9, Y2
+ VPXOR Y3, Y4, Y3
+ VPXOR Y1, Y2, Y1
+ VPAND Y8, Y0, Y2
+ VPSRLQ $0x04, Y8, Y4
+ VPAND Y0, Y4, Y4
+ VBROADCASTI128 32(AX), Y9
+ VBROADCASTI128 96(AX), Y10
+ VPSHUFB Y2, Y9, Y9
+ VPSHUFB Y2, Y10, Y2
+ VPXOR Y3, Y9, Y3
+ VPXOR Y1, Y2, Y1
+ VBROADCASTI128 48(AX), Y9
+ VBROADCASTI128 112(AX), Y2
+ VPSHUFB Y4, Y9, Y9
+ VPSHUFB Y4, Y2, Y2
+ XOR3WAY( $0x00, Y3, Y9, Y5)
+ XOR3WAY( $0x00, Y1, Y2, Y6)
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VMOVDQU Y5, (R10)
+ VMOVDQU Y6, 32(R10)
+ ADDQ $0x40, R10
+ VMOVDQU Y7, (DX)
+ VMOVDQU Y8, 32(DX)
+ ADDQ $0x40, DX
+ SUBQ $0x40, SI
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT4_avx2_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT4_avx2_3(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), AX
+ MOVQ table02+48(FP), AX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), CX
+ MOVQ work_base+0(FP), DX
+ MOVQ 8(DX), BX
+ XORQ SI, SI
+ MOVQ (DX)(SI*1), DI
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), R8
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), R9
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), CX
+
+loop:
+ VMOVDQU (DI), Y1
+ VMOVDQU 32(DI), Y2
+ VMOVDQU (R8), Y3
+ VMOVDQU 32(R8), Y4
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU (R9), Y5
+ VMOVDQU 32(R9), Y6
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPSRLQ $0x04, Y5, Y10
+ VPAND Y0, Y5, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (AX), Y11
+ VBROADCASTI128 64(AX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(AX), Y12
+ VBROADCASTI128 80(AX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y6, Y0, Y10
+ VPSRLQ $0x04, Y6, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(AX), Y13
+ VBROADCASTI128 96(AX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(AX), Y13
+ VBROADCASTI128 112(AX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y1)
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (AX), Y11
+ VBROADCASTI128 64(AX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(AX), Y12
+ VBROADCASTI128 80(AX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(AX), Y13
+ VBROADCASTI128 96(AX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(AX), Y13
+ VBROADCASTI128 112(AX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y3)
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VMOVDQU Y1, (DI)
+ VMOVDQU Y2, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y3, (R8)
+ VMOVDQU Y4, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y5, (R9)
+ VMOVDQU Y6, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y7, (CX)
+ VMOVDQU Y8, 32(CX)
+ ADDQ $0x40, CX
+ SUBQ $0x40, BX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func fftDIT4_avx2_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT4_avx2_3(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), AX
+ MOVQ table02+48(FP), CX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), CX
+ MOVQ work_base+0(FP), DX
+ MOVQ 8(DX), BX
+ XORQ SI, SI
+ MOVQ (DX)(SI*1), DI
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), R8
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), R9
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), CX
+
+loop:
+ VMOVDQU (DI), Y1
+ VMOVDQU 32(DI), Y2
+ VMOVDQU (R9), Y5
+ VMOVDQU 32(R9), Y6
+ VMOVDQU (R8), Y3
+ VMOVDQU 32(R8), Y4
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU Y1, (DI)
+ VMOVDQU Y2, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y3, (R8)
+ VMOVDQU Y4, 32(R8)
+ ADDQ $0x40, R8
+ VPSRLQ $0x04, Y7, Y2
+ VPAND Y0, Y7, Y1
+ VPAND Y0, Y2, Y2
+ VBROADCASTI128 (AX), Y3
+ VBROADCASTI128 64(AX), Y4
+ VPSHUFB Y1, Y3, Y3
+ VPSHUFB Y1, Y4, Y1
+ VBROADCASTI128 16(AX), Y4
+ VBROADCASTI128 80(AX), Y9
+ VPSHUFB Y2, Y4, Y4
+ VPSHUFB Y2, Y9, Y2
+ VPXOR Y3, Y4, Y3
+ VPXOR Y1, Y2, Y1
+ VPAND Y8, Y0, Y2
+ VPSRLQ $0x04, Y8, Y4
+ VPAND Y0, Y4, Y4
+ VBROADCASTI128 32(AX), Y9
+ VBROADCASTI128 96(AX), Y10
+ VPSHUFB Y2, Y9, Y9
+ VPSHUFB Y2, Y10, Y2
+ VPXOR Y3, Y9, Y3
+ VPXOR Y1, Y2, Y1
+ VBROADCASTI128 48(AX), Y9
+ VBROADCASTI128 112(AX), Y2
+ VPSHUFB Y4, Y9, Y9
+ VPSHUFB Y4, Y2, Y2
+ XOR3WAY( $0x00, Y3, Y9, Y5)
+ XOR3WAY( $0x00, Y1, Y2, Y6)
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VMOVDQU Y5, (R9)
+ VMOVDQU Y6, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y7, (CX)
+ VMOVDQU Y8, 32(CX)
+ ADDQ $0x40, CX
+ SUBQ $0x40, BX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT4_avx2_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT4_avx2_4(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), CX
+ MOVQ table02+48(FP), DX
+ MOVQ $0x0000000f, DX
+ MOVQ DX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), DX
+ MOVQ work_base+0(FP), BX
+ MOVQ 8(BX), SI
+ XORQ DI, DI
+ MOVQ (BX)(DI*1), R8
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), R9
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), R10
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), DX
+
+loop:
+ VMOVDQU (R8), Y1
+ VMOVDQU 32(R8), Y2
+ VMOVDQU (R9), Y3
+ VMOVDQU 32(R9), Y4
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VPSRLQ $0x04, Y3, Y6
+ VPAND Y0, Y3, Y5
+ VPAND Y0, Y6, Y6
+ VBROADCASTI128 (AX), Y7
+ VBROADCASTI128 64(AX), Y8
+ VPSHUFB Y5, Y7, Y7
+ VPSHUFB Y5, Y8, Y5
+ VBROADCASTI128 16(AX), Y8
+ VBROADCASTI128 80(AX), Y9
+ VPSHUFB Y6, Y8, Y8
+ VPSHUFB Y6, Y9, Y6
+ VPXOR Y7, Y8, Y7
+ VPXOR Y5, Y6, Y5
+ VPAND Y4, Y0, Y6
+ VPSRLQ $0x04, Y4, Y8
+ VPAND Y0, Y8, Y8
+ VBROADCASTI128 32(AX), Y9
+ VBROADCASTI128 96(AX), Y10
+ VPSHUFB Y6, Y9, Y9
+ VPSHUFB Y6, Y10, Y6
+ VPXOR Y7, Y9, Y7
+ VPXOR Y5, Y6, Y5
+ VBROADCASTI128 48(AX), Y9
+ VBROADCASTI128 112(AX), Y6
+ VPSHUFB Y8, Y9, Y9
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y7, Y9, Y1)
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU (R10), Y5
+ VMOVDQU 32(R10), Y6
+ VMOVDQU (DX), Y7
+ VMOVDQU 32(DX), Y8
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (CX), Y11
+ VBROADCASTI128 64(CX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(CX), Y12
+ VBROADCASTI128 80(CX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(CX), Y13
+ VBROADCASTI128 96(CX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(CX), Y13
+ VBROADCASTI128 112(CX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y5)
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VMOVDQU Y1, (R8)
+ VMOVDQU Y2, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y3, (R9)
+ VMOVDQU Y4, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y5, (R10)
+ VMOVDQU Y6, 32(R10)
+ ADDQ $0x40, R10
+ VMOVDQU Y7, (DX)
+ VMOVDQU Y8, 32(DX)
+ ADDQ $0x40, DX
+ SUBQ $0x40, SI
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func fftDIT4_avx2_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT4_avx2_4(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), CX
+ MOVQ table02+48(FP), CX
+ MOVQ $0x0000000f, DX
+ MOVQ DX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), DX
+ MOVQ work_base+0(FP), BX
+ MOVQ 8(BX), SI
+ XORQ DI, DI
+ MOVQ (BX)(DI*1), R8
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), R9
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), R10
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), DX
+
+loop:
+ VMOVDQU (R8), Y1
+ VMOVDQU 32(R8), Y2
+ VMOVDQU (R10), Y5
+ VMOVDQU 32(R10), Y6
+ VMOVDQU (R9), Y3
+ VMOVDQU 32(R9), Y4
+ VMOVDQU (DX), Y7
+ VMOVDQU 32(DX), Y8
+ VPSRLQ $0x04, Y5, Y10
+ VPAND Y0, Y5, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (CX), Y11
+ VBROADCASTI128 64(CX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(CX), Y12
+ VBROADCASTI128 80(CX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y6, Y0, Y10
+ VPSRLQ $0x04, Y6, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(CX), Y13
+ VBROADCASTI128 96(CX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(CX), Y13
+ VBROADCASTI128 112(CX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y1)
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (CX), Y11
+ VBROADCASTI128 64(CX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(CX), Y12
+ VBROADCASTI128 80(CX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(CX), Y13
+ VBROADCASTI128 96(CX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(CX), Y13
+ VBROADCASTI128 112(CX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y3)
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPSRLQ $0x04, Y3, Y10
+ VPAND Y0, Y3, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (AX), Y11
+ VBROADCASTI128 64(AX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(AX), Y12
+ VBROADCASTI128 80(AX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y4, Y0, Y10
+ VPSRLQ $0x04, Y4, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(AX), Y13
+ VBROADCASTI128 96(AX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(AX), Y13
+ VBROADCASTI128 112(AX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y1)
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU Y1, (R8)
+ VMOVDQU Y2, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y3, (R9)
+ VMOVDQU Y4, 32(R9)
+ ADDQ $0x40, R9
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VMOVDQU Y5, (R10)
+ VMOVDQU Y6, 32(R10)
+ ADDQ $0x40, R10
+ VMOVDQU Y7, (DX)
+ VMOVDQU Y8, 32(DX)
+ ADDQ $0x40, DX
+ SUBQ $0x40, SI
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT4_avx2_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT4_avx2_5(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), AX
+ MOVQ table02+48(FP), CX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), CX
+ MOVQ work_base+0(FP), DX
+ MOVQ 8(DX), BX
+ XORQ SI, SI
+ MOVQ (DX)(SI*1), DI
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), R8
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), R9
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), CX
+
+loop:
+ VMOVDQU (DI), Y1
+ VMOVDQU 32(DI), Y2
+ VMOVDQU (R8), Y3
+ VMOVDQU 32(R8), Y4
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU (R9), Y5
+ VMOVDQU 32(R9), Y6
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (AX), Y11
+ VBROADCASTI128 64(AX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(AX), Y12
+ VBROADCASTI128 80(AX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(AX), Y13
+ VBROADCASTI128 96(AX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(AX), Y13
+ VBROADCASTI128 112(AX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y5)
+ XOR3WAY( $0x00, Y9, Y10, Y6)
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VMOVDQU Y1, (DI)
+ VMOVDQU Y2, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y3, (R8)
+ VMOVDQU Y4, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y5, (R9)
+ VMOVDQU Y6, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y7, (CX)
+ VMOVDQU Y8, 32(CX)
+ ADDQ $0x40, CX
+ SUBQ $0x40, BX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func fftDIT4_avx2_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT4_avx2_5(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), CX
+ MOVQ table02+48(FP), CX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), CX
+ MOVQ work_base+0(FP), DX
+ MOVQ 8(DX), BX
+ XORQ SI, SI
+ MOVQ (DX)(SI*1), DI
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), R8
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), R9
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), CX
+
+loop:
+ VMOVDQU (DI), Y1
+ VMOVDQU 32(DI), Y2
+ VMOVDQU (R9), Y5
+ VMOVDQU 32(R9), Y6
+ VMOVDQU (R8), Y3
+ VMOVDQU 32(R8), Y4
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPSRLQ $0x04, Y3, Y10
+ VPAND Y0, Y3, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (AX), Y11
+ VBROADCASTI128 64(AX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(AX), Y12
+ VBROADCASTI128 80(AX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y4, Y0, Y10
+ VPSRLQ $0x04, Y4, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(AX), Y13
+ VBROADCASTI128 96(AX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(AX), Y13
+ VBROADCASTI128 112(AX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y1)
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU Y1, (DI)
+ VMOVDQU Y2, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y3, (R8)
+ VMOVDQU Y4, 32(R8)
+ ADDQ $0x40, R8
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VMOVDQU Y5, (R9)
+ VMOVDQU Y6, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y7, (CX)
+ VMOVDQU Y8, 32(CX)
+ ADDQ $0x40, CX
+ SUBQ $0x40, BX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT4_avx2_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT4_avx2_6(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), CX
+ MOVQ table02+48(FP), CX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), CX
+ MOVQ work_base+0(FP), DX
+ MOVQ 8(DX), BX
+ XORQ SI, SI
+ MOVQ (DX)(SI*1), DI
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), R8
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), R9
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), CX
+
+loop:
+ VMOVDQU (DI), Y1
+ VMOVDQU 32(DI), Y2
+ VMOVDQU (R8), Y3
+ VMOVDQU 32(R8), Y4
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VPSRLQ $0x04, Y3, Y6
+ VPAND Y0, Y3, Y5
+ VPAND Y0, Y6, Y6
+ VBROADCASTI128 (AX), Y7
+ VBROADCASTI128 64(AX), Y8
+ VPSHUFB Y5, Y7, Y7
+ VPSHUFB Y5, Y8, Y5
+ VBROADCASTI128 16(AX), Y8
+ VBROADCASTI128 80(AX), Y9
+ VPSHUFB Y6, Y8, Y8
+ VPSHUFB Y6, Y9, Y6
+ VPXOR Y7, Y8, Y7
+ VPXOR Y5, Y6, Y5
+ VPAND Y4, Y0, Y6
+ VPSRLQ $0x04, Y4, Y8
+ VPAND Y0, Y8, Y8
+ VBROADCASTI128 32(AX), Y9
+ VBROADCASTI128 96(AX), Y10
+ VPSHUFB Y6, Y9, Y9
+ VPSHUFB Y6, Y10, Y6
+ VPXOR Y7, Y9, Y7
+ VPXOR Y5, Y6, Y5
+ VBROADCASTI128 48(AX), Y9
+ VBROADCASTI128 112(AX), Y6
+ VPSHUFB Y8, Y9, Y9
+ VPSHUFB Y8, Y6, Y6
+ XOR3WAY( $0x00, Y7, Y9, Y1)
+ XOR3WAY( $0x00, Y5, Y6, Y2)
+ VMOVDQU (R9), Y5
+ VMOVDQU 32(R9), Y6
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VMOVDQU Y1, (DI)
+ VMOVDQU Y2, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y3, (R8)
+ VMOVDQU Y4, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y5, (R9)
+ VMOVDQU Y6, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y7, (CX)
+ VMOVDQU Y8, 32(CX)
+ ADDQ $0x40, CX
+ SUBQ $0x40, BX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func fftDIT4_avx2_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT4_avx2_6(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), AX
+ MOVQ table02+48(FP), AX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), CX
+ MOVQ work_base+0(FP), DX
+ MOVQ 8(DX), BX
+ XORQ SI, SI
+ MOVQ (DX)(SI*1), DI
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), R8
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), R9
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), CX
+
+loop:
+ VMOVDQU (DI), Y1
+ VMOVDQU 32(DI), Y2
+ VMOVDQU (R9), Y5
+ VMOVDQU 32(R9), Y6
+ VMOVDQU (R8), Y3
+ VMOVDQU 32(R8), Y4
+ VMOVDQU (CX), Y7
+ VMOVDQU 32(CX), Y8
+ VPSRLQ $0x04, Y5, Y10
+ VPAND Y0, Y5, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (AX), Y11
+ VBROADCASTI128 64(AX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(AX), Y12
+ VBROADCASTI128 80(AX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y6, Y0, Y10
+ VPSRLQ $0x04, Y6, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(AX), Y13
+ VBROADCASTI128 96(AX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(AX), Y13
+ VBROADCASTI128 112(AX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y1)
+ XOR3WAY( $0x00, Y9, Y10, Y2)
+ VPSRLQ $0x04, Y7, Y10
+ VPAND Y0, Y7, Y9
+ VPAND Y0, Y10, Y10
+ VBROADCASTI128 (AX), Y11
+ VBROADCASTI128 64(AX), Y12
+ VPSHUFB Y9, Y11, Y11
+ VPSHUFB Y9, Y12, Y9
+ VBROADCASTI128 16(AX), Y12
+ VBROADCASTI128 80(AX), Y13
+ VPSHUFB Y10, Y12, Y12
+ VPSHUFB Y10, Y13, Y10
+ VPXOR Y11, Y12, Y11
+ VPXOR Y9, Y10, Y9
+ VPAND Y8, Y0, Y10
+ VPSRLQ $0x04, Y8, Y12
+ VPAND Y0, Y12, Y12
+ VBROADCASTI128 32(AX), Y13
+ VBROADCASTI128 96(AX), Y14
+ VPSHUFB Y10, Y13, Y13
+ VPSHUFB Y10, Y14, Y10
+ VPXOR Y11, Y13, Y11
+ VPXOR Y9, Y10, Y9
+ VBROADCASTI128 48(AX), Y13
+ VBROADCASTI128 112(AX), Y10
+ VPSHUFB Y12, Y13, Y13
+ VPSHUFB Y12, Y10, Y10
+ XOR3WAY( $0x00, Y11, Y13, Y3)
+ XOR3WAY( $0x00, Y9, Y10, Y4)
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPXOR Y1, Y3, Y3
+ VPXOR Y2, Y4, Y4
+ VMOVDQU Y1, (DI)
+ VMOVDQU Y2, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y3, (R8)
+ VMOVDQU Y4, 32(R8)
+ ADDQ $0x40, R8
+ VPXOR Y5, Y7, Y7
+ VPXOR Y6, Y8, Y8
+ VMOVDQU Y5, (R9)
+ VMOVDQU Y6, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y7, (CX)
+ VMOVDQU Y8, 32(CX)
+ ADDQ $0x40, CX
+ SUBQ $0x40, BX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT4_avx2_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, SSE2
+TEXT ·ifftDIT4_avx2_7(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), AX
+ MOVQ table02+48(FP), AX
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU (SI), Y0
+ VMOVDQU 32(SI), Y1
+ VMOVDQU (DI), Y2
+ VMOVDQU 32(DI), Y3
+ VPXOR Y0, Y2, Y2
+ VPXOR Y1, Y3, Y3
+ VMOVDQU (R8), Y4
+ VMOVDQU 32(R8), Y5
+ VMOVDQU (AX), Y6
+ VMOVDQU 32(AX), Y7
+ VPXOR Y4, Y6, Y6
+ VPXOR Y5, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VMOVDQU Y0, (SI)
+ VMOVDQU Y1, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y2, (DI)
+ VMOVDQU Y3, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y4, (R8)
+ VMOVDQU Y5, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y6, (AX)
+ VMOVDQU Y7, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func fftDIT4_avx2_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
+// Requires: AVX, AVX2, SSE2
+TEXT ·fftDIT4_avx2_7(SB), NOSPLIT, $0-56
+ // dist must be multiplied by 24 (size of slice header)
+ MOVQ table01+32(FP), AX
+ MOVQ table23+40(FP), AX
+ MOVQ table02+48(FP), AX
+ MOVQ $0x0000000f, AX
+ MOVQ AX, X0
+ VPBROADCASTB X0, Y0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU (SI), Y0
+ VMOVDQU 32(SI), Y1
+ VMOVDQU (R8), Y4
+ VMOVDQU 32(R8), Y5
+ VMOVDQU (DI), Y2
+ VMOVDQU 32(DI), Y3
+ VMOVDQU (AX), Y6
+ VMOVDQU 32(AX), Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VPXOR Y0, Y2, Y2
+ VPXOR Y1, Y3, Y3
+ VMOVDQU Y0, (SI)
+ VMOVDQU Y1, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y2, (DI)
+ VMOVDQU Y3, 32(DI)
+ ADDQ $0x40, DI
+ VPXOR Y4, Y6, Y6
+ VPXOR Y5, Y7, Y7
+ VMOVDQU Y4, (R8)
+ VMOVDQU Y5, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y6, (AX)
+ VMOVDQU Y7, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JNZ loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT2_ssse3(x []byte, y []byte, table *[128]uint8)
+// Requires: SSE, SSE2, SSSE3
+TEXT ·ifftDIT2_ssse3(SB), NOSPLIT, $0-56
+ MOVQ table+48(FP), AX
+ MOVUPS (AX), X0
+ MOVUPS 64(AX), X1
+ MOVUPS 16(AX), X2
+ MOVUPS 80(AX), X3
+ MOVUPS 32(AX), X4
+ MOVUPS 96(AX), X5
+ XORPS X6, X6
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X7
+ PSHUFB X6, X7
+ MOVQ x_len+8(FP), CX
+ MOVQ x_base+0(FP), DX
+ MOVQ y_base+24(FP), BX
+
+loop:
+ MOVUPS (DX), X6
+ MOVUPS 32(DX), X8
+ MOVUPS (BX), X9
+ MOVUPS 32(BX), X10
+ PXOR X6, X9
+ PXOR X8, X10
+ MOVUPS X9, (BX)
+ MOVUPS X10, 32(BX)
+ MOVAPS X9, X11
+ PSRLQ $0x04, X11
+ MOVAPS X9, X9
+ PAND X7, X9
+ PAND X7, X11
+ MOVUPS X0, X12
+ MOVUPS X1, X13
+ PSHUFB X9, X12
+ PSHUFB X9, X13
+ MOVUPS X2, X9
+ MOVUPS X3, X14
+ PSHUFB X11, X9
+ PSHUFB X11, X14
+ PXOR X9, X12
+ PXOR X14, X13
+ MOVAPS X10, X9
+ MOVAPS X10, X10
+ PAND X7, X9
+ PSRLQ $0x04, X10
+ PAND X7, X10
+ MOVUPS X4, X11
+ MOVUPS X5, X14
+ PSHUFB X9, X11
+ PSHUFB X9, X14
+ PXOR X11, X12
+ PXOR X14, X13
+ MOVUPS 48(AX), X11
+ MOVUPS 112(AX), X14
+ PSHUFB X10, X11
+ PSHUFB X10, X14
+ PXOR X11, X12
+ PXOR X14, X13
+ PXOR X12, X6
+ PXOR X13, X8
+ MOVUPS X6, (DX)
+ MOVUPS X8, 32(DX)
+ MOVUPS 16(DX), X6
+ MOVUPS 48(DX), X8
+ MOVUPS 16(BX), X9
+ MOVUPS 48(BX), X10
+ PXOR X6, X9
+ PXOR X8, X10
+ MOVUPS X9, 16(BX)
+ MOVUPS X10, 48(BX)
+ MOVAPS X9, X11
+ PSRLQ $0x04, X11
+ MOVAPS X9, X9
+ PAND X7, X9
+ PAND X7, X11
+ MOVUPS X0, X12
+ MOVUPS X1, X13
+ PSHUFB X9, X12
+ PSHUFB X9, X13
+ MOVUPS X2, X9
+ MOVUPS X3, X14
+ PSHUFB X11, X9
+ PSHUFB X11, X14
+ PXOR X9, X12
+ PXOR X14, X13
+ MOVAPS X10, X9
+ MOVAPS X10, X10
+ PAND X7, X9
+ PSRLQ $0x04, X10
+ PAND X7, X10
+ MOVUPS X4, X11
+ MOVUPS X5, X14
+ PSHUFB X9, X11
+ PSHUFB X9, X14
+ PXOR X11, X12
+ PXOR X14, X13
+ MOVUPS 48(AX), X11
+ MOVUPS 112(AX), X14
+ PSHUFB X10, X11
+ PSHUFB X10, X14
+ PXOR X11, X12
+ PXOR X14, X13
+ PXOR X12, X6
+ PXOR X13, X8
+ MOVUPS X6, 16(DX)
+ MOVUPS X8, 48(DX)
+ ADDQ $0x40, DX
+ ADDQ $0x40, BX
+ SUBQ $0x40, CX
+ JNZ loop
+ RET
+
+// func fftDIT2_ssse3(x []byte, y []byte, table *[128]uint8)
+// Requires: SSE, SSE2, SSSE3
+TEXT ·fftDIT2_ssse3(SB), NOSPLIT, $0-56
+ MOVQ table+48(FP), AX
+ MOVUPS (AX), X0
+ MOVUPS 64(AX), X1
+ MOVUPS 16(AX), X2
+ MOVUPS 80(AX), X3
+ MOVUPS 32(AX), X4
+ MOVUPS 96(AX), X5
+ XORPS X6, X6
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X7
+ PSHUFB X6, X7
+ MOVQ x_len+8(FP), CX
+ MOVQ x_base+0(FP), DX
+ MOVQ y_base+24(FP), BX
+
+loop:
+ MOVUPS (BX), X9
+ MOVUPS 32(BX), X10
+ MOVAPS X9, X8
+ PSRLQ $0x04, X8
+ MOVAPS X9, X6
+ PAND X7, X6
+ PAND X7, X8
+ MOVUPS X0, X11
+ MOVUPS X1, X12
+ PSHUFB X6, X11
+ PSHUFB X6, X12
+ MOVUPS X2, X6
+ MOVUPS X3, X13
+ PSHUFB X8, X6
+ PSHUFB X8, X13
+ PXOR X6, X11
+ PXOR X13, X12
+ MOVAPS X10, X6
+ MOVAPS X10, X8
+ PAND X7, X6
+ PSRLQ $0x04, X8
+ PAND X7, X8
+ MOVUPS X4, X13
+ MOVUPS X5, X14
+ PSHUFB X6, X13
+ PSHUFB X6, X14
+ PXOR X13, X11
+ PXOR X14, X12
+ MOVUPS 48(AX), X13
+ MOVUPS 112(AX), X14
+ PSHUFB X8, X13
+ PSHUFB X8, X14
+ PXOR X13, X11
+ PXOR X14, X12
+ MOVUPS (DX), X6
+ MOVUPS 32(DX), X8
+ PXOR X11, X6
+ PXOR X12, X8
+ MOVUPS X6, (DX)
+ MOVUPS X8, 32(DX)
+ PXOR X6, X9
+ PXOR X8, X10
+ MOVUPS X9, (BX)
+ MOVUPS X10, 32(BX)
+ MOVUPS 16(BX), X9
+ MOVUPS 48(BX), X10
+ MOVAPS X9, X8
+ PSRLQ $0x04, X8
+ MOVAPS X9, X6
+ PAND X7, X6
+ PAND X7, X8
+ MOVUPS X0, X11
+ MOVUPS X1, X12
+ PSHUFB X6, X11
+ PSHUFB X6, X12
+ MOVUPS X2, X6
+ MOVUPS X3, X13
+ PSHUFB X8, X6
+ PSHUFB X8, X13
+ PXOR X6, X11
+ PXOR X13, X12
+ MOVAPS X10, X6
+ MOVAPS X10, X8
+ PAND X7, X6
+ PSRLQ $0x04, X8
+ PAND X7, X8
+ MOVUPS X4, X13
+ MOVUPS X5, X14
+ PSHUFB X6, X13
+ PSHUFB X6, X14
+ PXOR X13, X11
+ PXOR X14, X12
+ MOVUPS 48(AX), X13
+ MOVUPS 112(AX), X14
+ PSHUFB X8, X13
+ PSHUFB X8, X14
+ PXOR X13, X11
+ PXOR X14, X12
+ MOVUPS 16(DX), X6
+ MOVUPS 48(DX), X8
+ PXOR X11, X6
+ PXOR X12, X8
+ MOVUPS X6, 16(DX)
+ MOVUPS X8, 48(DX)
+ PXOR X6, X9
+ PXOR X8, X10
+ MOVUPS X9, 16(BX)
+ MOVUPS X10, 48(BX)
+ ADDQ $0x40, DX
+ ADDQ $0x40, BX
+ SUBQ $0x40, CX
+ JNZ loop
+ RET
+
+// func mulgf16_ssse3(x []byte, y []byte, table *[128]uint8)
+// Requires: SSE, SSE2, SSSE3
+TEXT ·mulgf16_ssse3(SB), NOSPLIT, $0-56
+ MOVQ table+48(FP), AX
+ MOVUPS (AX), X0
+ MOVUPS 64(AX), X1
+ MOVUPS 16(AX), X2
+ MOVUPS 80(AX), X3
+ MOVUPS 32(AX), X4
+ MOVUPS 96(AX), X5
+ MOVUPS 48(AX), X6
+ MOVUPS 112(AX), X7
+ MOVQ x_len+8(FP), AX
+ MOVQ x_base+0(FP), CX
+ MOVQ y_base+24(FP), DX
+ XORPS X8, X8
+ MOVQ $0x0000000f, BX
+ MOVQ BX, X9
+ PSHUFB X8, X9
+
+loop:
+ MOVUPS (DX), X8
+ MOVUPS 32(DX), X10
+ MOVAPS X8, X11
+ PSRLQ $0x04, X11
+ MOVAPS X8, X8
+ PAND X9, X8
+ PAND X9, X11
+ MOVUPS X0, X12
+ MOVUPS X1, X13
+ PSHUFB X8, X12
+ PSHUFB X8, X13
+ MOVUPS X2, X8
+ MOVUPS X3, X14
+ PSHUFB X11, X8
+ PSHUFB X11, X14
+ PXOR X8, X12
+ PXOR X14, X13
+ MOVAPS X10, X8
+ MOVAPS X10, X10
+ PAND X9, X8
+ PSRLQ $0x04, X10
+ PAND X9, X10
+ MOVUPS X4, X11
+ MOVUPS X5, X14
+ PSHUFB X8, X11
+ PSHUFB X8, X14
+ PXOR X11, X12
+ PXOR X14, X13
+ MOVUPS X6, X11
+ MOVUPS X7, X14
+ PSHUFB X10, X11
+ PSHUFB X10, X14
+ PXOR X11, X12
+ PXOR X14, X13
+ MOVUPS X12, (CX)
+ MOVUPS X13, 32(CX)
+ MOVUPS 16(DX), X8
+ MOVUPS 48(DX), X10
+ MOVAPS X8, X11
+ PSRLQ $0x04, X11
+ MOVAPS X8, X8
+ PAND X9, X8
+ PAND X9, X11
+ MOVUPS X0, X12
+ MOVUPS X1, X13
+ PSHUFB X8, X12
+ PSHUFB X8, X13
+ MOVUPS X2, X8
+ MOVUPS X3, X14
+ PSHUFB X11, X8
+ PSHUFB X11, X14
+ PXOR X8, X12
+ PXOR X14, X13
+ MOVAPS X10, X8
+ MOVAPS X10, X10
+ PAND X9, X8
+ PSRLQ $0x04, X10
+ PAND X9, X10
+ MOVUPS X4, X11
+ MOVUPS X5, X14
+ PSHUFB X8, X11
+ PSHUFB X8, X14
+ PXOR X11, X12
+ PXOR X14, X13
+ MOVUPS X6, X11
+ MOVUPS X7, X14
+ PSHUFB X10, X11
+ PSHUFB X10, X14
+ PXOR X11, X12
+ PXOR X14, X13
+ MOVUPS X12, 16(CX)
+ MOVUPS X13, 48(CX)
+ ADDQ $0x40, CX
+ ADDQ $0x40, DX
+ SUBQ $0x40, AX
+ JNZ loop
+ RET
+
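For reference, the routines from ifftDIT28_avx2 onward take 32-byte tables in which the first 16 bytes are a lookup for the low nibble of each source byte and the next 16 bytes a lookup for the high nibble (loaded with VBROADCASTI128 from offsets 0 and 16); the two lookup results are XORed into the destination. The 128-byte tables used by the 16-bit routines above pack four such low/high pairs (offsets 0/64, 16/80, 32/96 and 48/112), one per nibble of the 16-bit symbols. A rough scalar sketch of the per-byte multiply-add, assuming mulLo/mulHi are the per-constant nibble tables prepared elsewhere in the package:

	// Scalar model of the nibble-table multiply-add vectorised by the
	// SSSE3/AVX2 kernels (the LEO_MULADD_256 blocks below):
	//   dst[i] ^= mulLo[src[i]&0x0f] ^ mulHi[src[i]>>4]
	// mulLo/mulHi are assumed to hold the products of one fixed field
	// constant with every low/high nibble value; sketch only.
	func mulAddNibbleTables(dst, src []byte, mulLo, mulHi *[16]byte) {
		for i := range src {
			dst[i] ^= mulLo[src[i]&0x0f] ^ mulHi[src[i]>>4]
		}
	}
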
+// func ifftDIT28_avx2(x []byte, y []byte, table *[32]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT28_avx2(SB), NOSPLIT, $0-56
+ MOVQ table+48(FP), AX
+ VBROADCASTI128 (AX), Y0
+ VBROADCASTI128 16(AX), Y1
+ MOVQ x_len+8(FP), AX
+ MOVQ x_base+0(FP), CX
+ MOVQ y_base+24(FP), DX
+ MOVQ $0x0000000f, BX
+ MOVQ BX, X2
+ VPBROADCASTB X2, Y2
+
+loop:
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VMOVDQU (DX), Y5
+ VMOVDQU 32(DX), Y6
+ VPXOR Y5, Y3, Y5
+ VPXOR Y6, Y4, Y6
+ VMOVDQU Y5, (DX)
+ VMOVDQU Y6, 32(DX)
+
+ // LEO_MULADD_256
+ VPAND Y5, Y2, Y7
+ VPSRLQ $0x04, Y5, Y5
+ VPSHUFB Y7, Y0, Y7
+ VPAND Y5, Y2, Y5
+ VPSHUFB Y5, Y1, Y5
+ XOR3WAY( $0x00, Y7, Y5, Y3)
+
+ // LEO_MULADD_256
+ VPAND Y6, Y2, Y5
+ VPSRLQ $0x04, Y6, Y6
+ VPSHUFB Y5, Y0, Y5
+ VPAND Y6, Y2, Y6
+ VPSHUFB Y6, Y1, Y6
+ XOR3WAY( $0x00, Y5, Y6, Y4)
+ VMOVDQU Y3, (CX)
+ VMOVDQU Y4, 32(CX)
+ ADDQ $0x40, CX
+ ADDQ $0x40, DX
+ SUBQ $0x40, AX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT28_avx2(x []byte, y []byte, table *[32]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT28_avx2(SB), NOSPLIT, $0-56
+ MOVQ table+48(FP), AX
+ VBROADCASTI128 (AX), Y0
+ VBROADCASTI128 16(AX), Y1
+ MOVQ x_len+8(FP), AX
+ MOVQ x_base+0(FP), CX
+ MOVQ y_base+24(FP), DX
+ MOVQ $0x0000000f, BX
+ MOVQ BX, X2
+ VPBROADCASTB X2, Y2
+
+loop:
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y4
+ VMOVDQU (DX), Y5
+ VMOVDQU 32(DX), Y6
+
+ // LEO_MULADD_256
+ VPAND Y5, Y2, Y7
+ VPSRLQ $0x04, Y5, Y8
+ VPSHUFB Y7, Y0, Y7
+ VPAND Y8, Y2, Y8
+ VPSHUFB Y8, Y1, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // LEO_MULADD_256
+ VPAND Y6, Y2, Y7
+ VPSRLQ $0x04, Y6, Y8
+ VPSHUFB Y7, Y0, Y7
+ VPAND Y8, Y2, Y8
+ VPSHUFB Y8, Y1, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y4)
+ VMOVDQU Y3, (CX)
+ VMOVDQU Y4, 32(CX)
+ VPXOR Y5, Y3, Y5
+ VPXOR Y6, Y4, Y6
+ VMOVDQU Y5, (DX)
+ VMOVDQU Y6, 32(DX)
+ ADDQ $0x40, CX
+ ADDQ $0x40, DX
+ SUBQ $0x40, AX
+ JA loop
+ VZEROUPPER
+ RET
+
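The ifftDIT48_*/fftDIT48_* variants below are specialisations of one 4-way butterfly over four work slices picked out of work at a stride of dist slice headers; the numeric suffix encodes which of the three constant multiplies (t01, t23, t02) are skipped. In scalar form the inverse butterfly is roughly the following sketch, where mul01/mul23/mul02 stand for "dst ^= constant * src" with the respective table or matrix argument; the fftDIT48_* counterparts run the layers in the opposite order (t02 first, and within each pair the multiply before the XOR):

	// Scalar sketch of the 4-point inverse butterfly that the
	// ifftDIT48_* kernels vectorise. The _1.._7 variants simply drop
	// the mul calls whose constant is skipped.
	func ifftDIT4Ref(w0, w1, w2, w3 []byte, mul01, mul23, mul02 func(dst, src []byte)) {
		for i := range w0 {
			w1[i] ^= w0[i] // first layer, pair (w0, w1)
		}
		mul01(w0, w1)
		for i := range w2 {
			w3[i] ^= w2[i] // first layer, pair (w2, w3)
		}
		mul23(w2, w3)
		for i := range w0 {
			w2[i] ^= w0[i] // second layer
			w3[i] ^= w1[i]
		}
		mul02(w0, w2)
		mul02(w1, w3)
	}
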
+// func ifftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT48_avx2_0(SB), NOSPLIT, $0-56
+ MOVQ t01+32(FP), AX
+ VBROADCASTI128 16(AX), Y0
+ MOVQ t23+40(FP), CX
+ VBROADCASTI128 (CX), Y1
+ VBROADCASTI128 16(CX), Y2
+ MOVQ t02+48(FP), CX
+ VBROADCASTI128 (CX), Y3
+ VBROADCASTI128 16(CX), Y4
+ MOVQ dist+24(FP), CX
+ MOVQ work_base+0(FP), DX
+ MOVQ 8(DX), BX
+ XORQ SI, SI
+ MOVQ (DX)(SI*1), DI
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), R8
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), R9
+ ADDQ CX, SI
+ MOVQ (DX)(SI*1), CX
+ MOVQ $0x0000000f, DX
+ MOVQ DX, X5
+ VPBROADCASTB X5, Y5
+
+loop:
+ VMOVDQU (DI), Y6
+ VMOVDQU (R8), Y7
+ VMOVDQU 32(DI), Y8
+ VMOVDQU 32(R8), Y9
+ VPXOR Y7, Y6, Y7
+ VPXOR Y9, Y8, Y9
+ VBROADCASTI128 (AX), Y10
+
+ // LEO_MULADD_256
+ VPAND Y7, Y5, Y11
+ VPSRLQ $0x04, Y7, Y12
+ VPSHUFB Y11, Y10, Y11
+ VPAND Y12, Y5, Y12
+ VPSHUFB Y12, Y0, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+
+ // LEO_MULADD_256
+ VPAND Y9, Y5, Y11
+ VPSRLQ $0x04, Y9, Y12
+ VPSHUFB Y11, Y10, Y11
+ VPAND Y12, Y5, Y12
+ VPSHUFB Y12, Y0, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VMOVDQU (R9), Y10
+ VMOVDQU (CX), Y11
+ VMOVDQU 32(R9), Y12
+ VMOVDQU 32(CX), Y13
+ VPXOR Y10, Y11, Y11
+ VPXOR Y12, Y13, Y13
+
+ // LEO_MULADD_256
+ VPAND Y11, Y5, Y14
+ VPSRLQ $0x04, Y11, Y15
+ VPSHUFB Y14, Y1, Y14
+ VPAND Y15, Y5, Y15
+ VPSHUFB Y15, Y2, Y15
+ XOR3WAY( $0x00, Y14, Y15, Y10)
+
+ // LEO_MULADD_256
+ VPAND Y13, Y5, Y14
+ VPSRLQ $0x04, Y13, Y15
+ VPSHUFB Y14, Y1, Y14
+ VPAND Y15, Y5, Y15
+ VPSHUFB Y15, Y2, Y15
+ XOR3WAY( $0x00, Y14, Y15, Y12)
+ VPXOR Y6, Y10, Y10
+ VPXOR Y7, Y11, Y11
+ VPXOR Y8, Y12, Y12
+ VPXOR Y9, Y13, Y13
+
+ // LEO_MULADD_256
+ VPAND Y10, Y5, Y14
+ VPSRLQ $0x04, Y10, Y15
+ VPSHUFB Y14, Y3, Y14
+ VPAND Y15, Y5, Y15
+ VPSHUFB Y15, Y4, Y15
+ XOR3WAY( $0x00, Y14, Y15, Y6)
+
+ // LEO_MULADD_256
+ VPAND Y11, Y5, Y14
+ VPSRLQ $0x04, Y11, Y15
+ VPSHUFB Y14, Y3, Y14
+ VPAND Y15, Y5, Y15
+ VPSHUFB Y15, Y4, Y15
+ XOR3WAY( $0x00, Y14, Y15, Y7)
+
+ // LEO_MULADD_256
+ VPAND Y12, Y5, Y14
+ VPSRLQ $0x04, Y12, Y15
+ VPSHUFB Y14, Y3, Y14
+ VPAND Y15, Y5, Y15
+ VPSHUFB Y15, Y4, Y15
+ XOR3WAY( $0x00, Y14, Y15, Y8)
+
+ // LEO_MULADD_256
+ VPAND Y13, Y5, Y14
+ VPSRLQ $0x04, Y13, Y15
+ VPSHUFB Y14, Y3, Y14
+ VPAND Y15, Y5, Y15
+ VPSHUFB Y15, Y4, Y15
+ XOR3WAY( $0x00, Y14, Y15, Y9)
+ VMOVDQU Y6, (DI)
+ VMOVDQU Y8, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y7, (R8)
+ VMOVDQU Y9, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y10, (R9)
+ VMOVDQU Y12, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y11, (CX)
+ VMOVDQU Y13, 32(CX)
+ ADDQ $0x40, CX
+ SUBQ $0x40, BX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT48_avx2_0(SB), NOSPLIT, $0-56
+ MOVQ t01+32(FP), AX
+ VBROADCASTI128 16(AX), Y0
+ MOVQ t23+40(FP), CX
+ VBROADCASTI128 16(CX), Y1
+ MOVQ t02+48(FP), DX
+ VBROADCASTI128 (DX), Y2
+ VBROADCASTI128 16(DX), Y3
+ MOVQ dist+24(FP), DX
+ MOVQ work_base+0(FP), BX
+ MOVQ 8(BX), SI
+ XORQ DI, DI
+ MOVQ (BX)(DI*1), R8
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), R9
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), R10
+ ADDQ DX, DI
+ MOVQ (BX)(DI*1), DX
+ MOVQ $0x0000000f, BX
+ MOVQ BX, X4
+ VPBROADCASTB X4, Y4
+
+loop:
+ VMOVDQU (R8), Y5
+ VMOVDQU 32(R8), Y6
+ VMOVDQU (R10), Y9
+ VMOVDQU 32(R10), Y10
+ VMOVDQU (R9), Y7
+ VMOVDQU 32(R9), Y8
+ VMOVDQU (DX), Y11
+ VMOVDQU 32(DX), Y12
+
+ // LEO_MULADD_256
+ VPAND Y9, Y4, Y13
+ VPSRLQ $0x04, Y9, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y5)
+
+ // LEO_MULADD_256
+ VPAND Y10, Y4, Y13
+ VPSRLQ $0x04, Y10, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y6)
+
+ // LEO_MULADD_256
+ VPAND Y11, Y4, Y13
+ VPSRLQ $0x04, Y11, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y7)
+
+ // LEO_MULADD_256
+ VPAND Y12, Y4, Y13
+ VPSRLQ $0x04, Y12, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y8)
+ VPXOR Y5, Y9, Y9
+ VPXOR Y7, Y11, Y11
+ VPXOR Y6, Y10, Y10
+ VPXOR Y8, Y12, Y12
+ VBROADCASTI128 (AX), Y13
+
+ // LEO_MULADD_256
+ VPAND Y7, Y4, Y14
+ VPSRLQ $0x04, Y7, Y15
+ VPSHUFB Y14, Y13, Y14
+ VPAND Y15, Y4, Y15
+ VPSHUFB Y15, Y0, Y15
+ XOR3WAY( $0x00, Y14, Y15, Y5)
+
+ // LEO_MULADD_256
+ VPAND Y8, Y4, Y14
+ VPSRLQ $0x04, Y8, Y15
+ VPSHUFB Y14, Y13, Y14
+ VPAND Y15, Y4, Y15
+ VPSHUFB Y15, Y0, Y15
+ XOR3WAY( $0x00, Y14, Y15, Y6)
+ VPXOR Y7, Y5, Y7
+ VPXOR Y8, Y6, Y8
+ VBROADCASTI128 (CX), Y13
+
+ // LEO_MULADD_256
+ VPAND Y11, Y4, Y14
+ VPSRLQ $0x04, Y11, Y15
+ VPSHUFB Y14, Y13, Y14
+ VPAND Y15, Y4, Y15
+ VPSHUFB Y15, Y1, Y15
+ XOR3WAY( $0x00, Y14, Y15, Y9)
+
+ // LEO_MULADD_256
+ VPAND Y12, Y4, Y14
+ VPSRLQ $0x04, Y12, Y15
+ VPSHUFB Y14, Y13, Y14
+ VPAND Y15, Y4, Y15
+ VPSHUFB Y15, Y1, Y15
+ XOR3WAY( $0x00, Y14, Y15, Y10)
+ VPXOR Y9, Y11, Y11
+ VPXOR Y10, Y12, Y12
+ VMOVDQU Y5, (R8)
+ VMOVDQU Y6, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y7, (R9)
+ VMOVDQU Y8, 32(R9)
+ ADDQ $0x40, R9
+ VMOVDQU Y9, (R10)
+ VMOVDQU Y10, 32(R10)
+ ADDQ $0x40, R10
+ VMOVDQU Y11, (DX)
+ VMOVDQU Y12, 32(DX)
+ ADDQ $0x40, DX
+ SUBQ $0x40, SI
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT48_avx2_1(SB), NOSPLIT, $0-56
+ MOVQ t23+40(FP), AX
+ VBROADCASTI128 (AX), Y0
+ VBROADCASTI128 16(AX), Y1
+ MOVQ t02+48(FP), AX
+ VBROADCASTI128 (AX), Y2
+ VBROADCASTI128 16(AX), Y3
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X4
+ VPBROADCASTB X4, Y4
+
+loop:
+ VMOVDQU (SI), Y5
+ VMOVDQU (DI), Y6
+ VMOVDQU 32(SI), Y7
+ VMOVDQU 32(DI), Y8
+ VPXOR Y6, Y5, Y6
+ VPXOR Y8, Y7, Y8
+ VMOVDQU (R8), Y9
+ VMOVDQU (AX), Y10
+ VMOVDQU 32(R8), Y11
+ VMOVDQU 32(AX), Y12
+ VPXOR Y9, Y10, Y10
+ VPXOR Y11, Y12, Y12
+
+ // LEO_MULADD_256
+ VPAND Y10, Y4, Y13
+ VPSRLQ $0x04, Y10, Y14
+ VPSHUFB Y13, Y0, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y1, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y9)
+
+ // LEO_MULADD_256
+ VPAND Y12, Y4, Y13
+ VPSRLQ $0x04, Y12, Y14
+ VPSHUFB Y13, Y0, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y1, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y11)
+ VPXOR Y5, Y9, Y9
+ VPXOR Y6, Y10, Y10
+ VPXOR Y7, Y11, Y11
+ VPXOR Y8, Y12, Y12
+
+ // LEO_MULADD_256
+ VPAND Y9, Y4, Y13
+ VPSRLQ $0x04, Y9, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y5)
+
+ // LEO_MULADD_256
+ VPAND Y10, Y4, Y13
+ VPSRLQ $0x04, Y10, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y6)
+
+ // LEO_MULADD_256
+ VPAND Y11, Y4, Y13
+ VPSRLQ $0x04, Y11, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y7)
+
+ // LEO_MULADD_256
+ VPAND Y12, Y4, Y13
+ VPSRLQ $0x04, Y12, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y8)
+ VMOVDQU Y5, (SI)
+ VMOVDQU Y7, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y6, (DI)
+ VMOVDQU Y8, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y9, (R8)
+ VMOVDQU Y11, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y10, (AX)
+ VMOVDQU Y12, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT48_avx2_1(SB), NOSPLIT, $0-56
+ MOVQ t01+32(FP), AX
+ VBROADCASTI128 (AX), Y0
+ VBROADCASTI128 16(AX), Y1
+ MOVQ t23+40(FP), AX
+ VBROADCASTI128 (AX), Y2
+ VBROADCASTI128 16(AX), Y3
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X4
+ VPBROADCASTB X4, Y4
+
+loop:
+ VMOVDQU (SI), Y5
+ VMOVDQU 32(SI), Y6
+ VMOVDQU (R8), Y9
+ VMOVDQU 32(R8), Y10
+ VMOVDQU (DI), Y7
+ VMOVDQU 32(DI), Y8
+ VMOVDQU (AX), Y11
+ VMOVDQU 32(AX), Y12
+ VPXOR Y5, Y9, Y9
+ VPXOR Y7, Y11, Y11
+ VPXOR Y6, Y10, Y10
+ VPXOR Y8, Y12, Y12
+
+ // LEO_MULADD_256
+ VPAND Y7, Y4, Y13
+ VPSRLQ $0x04, Y7, Y14
+ VPSHUFB Y13, Y0, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y1, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y5)
+
+ // LEO_MULADD_256
+ VPAND Y8, Y4, Y13
+ VPSRLQ $0x04, Y8, Y14
+ VPSHUFB Y13, Y0, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y1, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y6)
+ VPXOR Y7, Y5, Y7
+ VPXOR Y8, Y6, Y8
+
+ // LEO_MULADD_256
+ VPAND Y11, Y4, Y13
+ VPSRLQ $0x04, Y11, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y9)
+
+ // LEO_MULADD_256
+ VPAND Y12, Y4, Y13
+ VPSRLQ $0x04, Y12, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y10)
+ VPXOR Y9, Y11, Y11
+ VPXOR Y10, Y12, Y12
+ VMOVDQU Y5, (SI)
+ VMOVDQU Y6, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y7, (DI)
+ VMOVDQU Y8, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y9, (R8)
+ VMOVDQU Y10, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y11, (AX)
+ VMOVDQU Y12, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT48_avx2_2(SB), NOSPLIT, $0-56
+ MOVQ t01+32(FP), AX
+ VBROADCASTI128 (AX), Y0
+ VBROADCASTI128 16(AX), Y1
+ MOVQ t02+48(FP), AX
+ VBROADCASTI128 (AX), Y2
+ VBROADCASTI128 16(AX), Y3
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X4
+ VPBROADCASTB X4, Y4
+
+loop:
+ VMOVDQU (SI), Y5
+ VMOVDQU (DI), Y6
+ VMOVDQU 32(SI), Y7
+ VMOVDQU 32(DI), Y8
+ VPXOR Y6, Y5, Y6
+ VPXOR Y8, Y7, Y8
+
+ // LEO_MULADD_256
+ VPAND Y6, Y4, Y9
+ VPSRLQ $0x04, Y6, Y10
+ VPSHUFB Y9, Y0, Y9
+ VPAND Y10, Y4, Y10
+ VPSHUFB Y10, Y1, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // LEO_MULADD_256
+ VPAND Y8, Y4, Y9
+ VPSRLQ $0x04, Y8, Y10
+ VPSHUFB Y9, Y0, Y9
+ VPAND Y10, Y4, Y10
+ VPSHUFB Y10, Y1, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+ VMOVDQU (R8), Y9
+ VMOVDQU (AX), Y10
+ VMOVDQU 32(R8), Y11
+ VMOVDQU 32(AX), Y12
+ VPXOR Y9, Y10, Y10
+ VPXOR Y11, Y12, Y12
+ VPXOR Y5, Y9, Y9
+ VPXOR Y6, Y10, Y10
+ VPXOR Y7, Y11, Y11
+ VPXOR Y8, Y12, Y12
+
+ // LEO_MULADD_256
+ VPAND Y9, Y4, Y13
+ VPSRLQ $0x04, Y9, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y5)
+
+ // LEO_MULADD_256
+ VPAND Y10, Y4, Y13
+ VPSRLQ $0x04, Y10, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y6)
+
+ // LEO_MULADD_256
+ VPAND Y11, Y4, Y13
+ VPSRLQ $0x04, Y11, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y7)
+
+ // LEO_MULADD_256
+ VPAND Y12, Y4, Y13
+ VPSRLQ $0x04, Y12, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y8)
+ VMOVDQU Y5, (SI)
+ VMOVDQU Y7, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y6, (DI)
+ VMOVDQU Y8, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y9, (R8)
+ VMOVDQU Y11, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y10, (AX)
+ VMOVDQU Y12, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT48_avx2_2(SB), NOSPLIT, $0-56
+ MOVQ t23+40(FP), AX
+ VBROADCASTI128 (AX), Y0
+ VBROADCASTI128 16(AX), Y1
+ MOVQ t02+48(FP), AX
+ VBROADCASTI128 (AX), Y2
+ VBROADCASTI128 16(AX), Y3
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X4
+ VPBROADCASTB X4, Y4
+
+loop:
+ VMOVDQU (SI), Y5
+ VMOVDQU 32(SI), Y6
+ VMOVDQU (R8), Y9
+ VMOVDQU 32(R8), Y10
+ VMOVDQU (DI), Y7
+ VMOVDQU 32(DI), Y8
+ VMOVDQU (AX), Y11
+ VMOVDQU 32(AX), Y12
+
+ // LEO_MULADD_256
+ VPAND Y9, Y4, Y13
+ VPSRLQ $0x04, Y9, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y5)
+
+ // LEO_MULADD_256
+ VPAND Y10, Y4, Y13
+ VPSRLQ $0x04, Y10, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y6)
+
+ // LEO_MULADD_256
+ VPAND Y11, Y4, Y13
+ VPSRLQ $0x04, Y11, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y7)
+
+ // LEO_MULADD_256
+ VPAND Y12, Y4, Y13
+ VPSRLQ $0x04, Y12, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y8)
+ VPXOR Y5, Y9, Y9
+ VPXOR Y7, Y11, Y11
+ VPXOR Y6, Y10, Y10
+ VPXOR Y8, Y12, Y12
+ VPXOR Y7, Y5, Y7
+ VPXOR Y8, Y6, Y8
+
+ // LEO_MULADD_256
+ VPAND Y11, Y4, Y13
+ VPSRLQ $0x04, Y11, Y14
+ VPSHUFB Y13, Y0, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y1, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y9)
+
+ // LEO_MULADD_256
+ VPAND Y12, Y4, Y13
+ VPSRLQ $0x04, Y12, Y14
+ VPSHUFB Y13, Y0, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y1, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y10)
+ VPXOR Y9, Y11, Y11
+ VPXOR Y10, Y12, Y12
+ VMOVDQU Y5, (SI)
+ VMOVDQU Y6, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y7, (DI)
+ VMOVDQU Y8, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y9, (R8)
+ VMOVDQU Y10, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y11, (AX)
+ VMOVDQU Y12, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT48_avx2_3(SB), NOSPLIT, $0-56
+ MOVQ t02+48(FP), AX
+ VBROADCASTI128 (AX), Y0
+ VBROADCASTI128 16(AX), Y1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X2
+ VPBROADCASTB X2, Y2
+
+loop:
+ VMOVDQU (SI), Y3
+ VMOVDQU (DI), Y4
+ VMOVDQU 32(SI), Y5
+ VMOVDQU 32(DI), Y6
+ VPXOR Y4, Y3, Y4
+ VPXOR Y6, Y5, Y6
+ VMOVDQU (R8), Y7
+ VMOVDQU (AX), Y8
+ VMOVDQU 32(R8), Y9
+ VMOVDQU 32(AX), Y10
+ VPXOR Y7, Y8, Y8
+ VPXOR Y9, Y10, Y10
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPXOR Y5, Y9, Y9
+ VPXOR Y6, Y10, Y10
+
+ // LEO_MULADD_256
+ VPAND Y7, Y2, Y11
+ VPSRLQ $0x04, Y7, Y12
+ VPSHUFB Y11, Y0, Y11
+ VPAND Y12, Y2, Y12
+ VPSHUFB Y12, Y1, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+
+ // LEO_MULADD_256
+ VPAND Y8, Y2, Y11
+ VPSRLQ $0x04, Y8, Y12
+ VPSHUFB Y11, Y0, Y11
+ VPAND Y12, Y2, Y12
+ VPSHUFB Y12, Y1, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+
+ // LEO_MULADD_256
+ VPAND Y9, Y2, Y11
+ VPSRLQ $0x04, Y9, Y12
+ VPSHUFB Y11, Y0, Y11
+ VPAND Y12, Y2, Y12
+ VPSHUFB Y12, Y1, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+
+ // LEO_MULADD_256
+ VPAND Y10, Y2, Y11
+ VPSRLQ $0x04, Y10, Y12
+ VPSHUFB Y11, Y0, Y11
+ VPAND Y12, Y2, Y12
+ VPSHUFB Y12, Y1, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VMOVDQU Y3, (SI)
+ VMOVDQU Y5, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y4, (DI)
+ VMOVDQU Y6, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y7, (R8)
+ VMOVDQU Y9, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y8, (AX)
+ VMOVDQU Y10, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT48_avx2_3(SB), NOSPLIT, $0-56
+ MOVQ t23+40(FP), AX
+ VBROADCASTI128 (AX), Y0
+ VBROADCASTI128 16(AX), Y1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X2
+ VPBROADCASTB X2, Y2
+
+loop:
+ VMOVDQU (SI), Y3
+ VMOVDQU 32(SI), Y4
+ VMOVDQU (R8), Y7
+ VMOVDQU 32(R8), Y8
+ VMOVDQU (DI), Y5
+ VMOVDQU 32(DI), Y6
+ VMOVDQU (AX), Y9
+ VMOVDQU 32(AX), Y10
+ VPXOR Y3, Y7, Y7
+ VPXOR Y5, Y9, Y9
+ VPXOR Y4, Y8, Y8
+ VPXOR Y6, Y10, Y10
+ VPXOR Y5, Y3, Y5
+ VPXOR Y6, Y4, Y6
+
+ // LEO_MULADD_256
+ VPAND Y9, Y2, Y11
+ VPSRLQ $0x04, Y9, Y12
+ VPSHUFB Y11, Y0, Y11
+ VPAND Y12, Y2, Y12
+ VPSHUFB Y12, Y1, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+
+ // LEO_MULADD_256
+ VPAND Y10, Y2, Y11
+ VPSRLQ $0x04, Y10, Y12
+ VPSHUFB Y11, Y0, Y11
+ VPAND Y12, Y2, Y12
+ VPSHUFB Y12, Y1, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y8)
+ VPXOR Y7, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VMOVDQU Y3, (SI)
+ VMOVDQU Y4, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y5, (DI)
+ VMOVDQU Y6, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y7, (R8)
+ VMOVDQU Y8, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y9, (AX)
+ VMOVDQU Y10, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT48_avx2_4(SB), NOSPLIT, $0-56
+ MOVQ t01+32(FP), AX
+ VBROADCASTI128 (AX), Y0
+ VBROADCASTI128 16(AX), Y1
+ MOVQ t23+40(FP), AX
+ VBROADCASTI128 (AX), Y2
+ VBROADCASTI128 16(AX), Y3
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X4
+ VPBROADCASTB X4, Y4
+
+loop:
+ VMOVDQU (SI), Y5
+ VMOVDQU (DI), Y6
+ VMOVDQU 32(SI), Y7
+ VMOVDQU 32(DI), Y8
+ VPXOR Y6, Y5, Y6
+ VPXOR Y8, Y7, Y8
+
+ // LEO_MULADD_256
+ VPAND Y6, Y4, Y9
+ VPSRLQ $0x04, Y6, Y10
+ VPSHUFB Y9, Y0, Y9
+ VPAND Y10, Y4, Y10
+ VPSHUFB Y10, Y1, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y5)
+
+ // LEO_MULADD_256
+ VPAND Y8, Y4, Y9
+ VPSRLQ $0x04, Y8, Y10
+ VPSHUFB Y9, Y0, Y9
+ VPAND Y10, Y4, Y10
+ VPSHUFB Y10, Y1, Y10
+ XOR3WAY( $0x00, Y9, Y10, Y7)
+ VMOVDQU (R8), Y9
+ VMOVDQU (AX), Y10
+ VMOVDQU 32(R8), Y11
+ VMOVDQU 32(AX), Y12
+ VPXOR Y9, Y10, Y10
+ VPXOR Y11, Y12, Y12
+
+ // LEO_MULADD_256
+ VPAND Y10, Y4, Y13
+ VPSRLQ $0x04, Y10, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y9)
+
+ // LEO_MULADD_256
+ VPAND Y12, Y4, Y13
+ VPSRLQ $0x04, Y12, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y11)
+ VPXOR Y5, Y9, Y9
+ VPXOR Y6, Y10, Y10
+ VPXOR Y7, Y11, Y11
+ VPXOR Y8, Y12, Y12
+ VMOVDQU Y5, (SI)
+ VMOVDQU Y7, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y6, (DI)
+ VMOVDQU Y8, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y9, (R8)
+ VMOVDQU Y11, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y10, (AX)
+ VMOVDQU Y12, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT48_avx2_4(SB), NOSPLIT, $0-56
+ MOVQ t01+32(FP), AX
+ VBROADCASTI128 (AX), Y0
+ VBROADCASTI128 16(AX), Y1
+ MOVQ t02+48(FP), AX
+ VBROADCASTI128 (AX), Y2
+ VBROADCASTI128 16(AX), Y3
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X4
+ VPBROADCASTB X4, Y4
+
+loop:
+ VMOVDQU (SI), Y5
+ VMOVDQU 32(SI), Y6
+ VMOVDQU (R8), Y9
+ VMOVDQU 32(R8), Y10
+ VMOVDQU (DI), Y7
+ VMOVDQU 32(DI), Y8
+ VMOVDQU (AX), Y11
+ VMOVDQU 32(AX), Y12
+
+ // LEO_MULADD_256
+ VPAND Y9, Y4, Y13
+ VPSRLQ $0x04, Y9, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y5)
+
+ // LEO_MULADD_256
+ VPAND Y10, Y4, Y13
+ VPSRLQ $0x04, Y10, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y6)
+
+ // LEO_MULADD_256
+ VPAND Y11, Y4, Y13
+ VPSRLQ $0x04, Y11, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y7)
+
+ // LEO_MULADD_256
+ VPAND Y12, Y4, Y13
+ VPSRLQ $0x04, Y12, Y14
+ VPSHUFB Y13, Y2, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y3, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y8)
+ VPXOR Y5, Y9, Y9
+ VPXOR Y7, Y11, Y11
+ VPXOR Y6, Y10, Y10
+ VPXOR Y8, Y12, Y12
+
+ // LEO_MULADD_256
+ VPAND Y7, Y4, Y13
+ VPSRLQ $0x04, Y7, Y14
+ VPSHUFB Y13, Y0, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y1, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y5)
+
+ // LEO_MULADD_256
+ VPAND Y8, Y4, Y13
+ VPSRLQ $0x04, Y8, Y14
+ VPSHUFB Y13, Y0, Y13
+ VPAND Y14, Y4, Y14
+ VPSHUFB Y14, Y1, Y14
+ XOR3WAY( $0x00, Y13, Y14, Y6)
+ VPXOR Y7, Y5, Y7
+ VPXOR Y8, Y6, Y8
+ VPXOR Y9, Y11, Y11
+ VPXOR Y10, Y12, Y12
+ VMOVDQU Y5, (SI)
+ VMOVDQU Y6, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y7, (DI)
+ VMOVDQU Y8, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y9, (R8)
+ VMOVDQU Y10, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y11, (AX)
+ VMOVDQU Y12, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT48_avx2_5(SB), NOSPLIT, $0-56
+ MOVQ t23+40(FP), AX
+ VBROADCASTI128 (AX), Y0
+ VBROADCASTI128 16(AX), Y1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X2
+ VPBROADCASTB X2, Y2
+
+loop:
+ VMOVDQU (SI), Y3
+ VMOVDQU (DI), Y4
+ VMOVDQU 32(SI), Y5
+ VMOVDQU 32(DI), Y6
+ VPXOR Y4, Y3, Y4
+ VPXOR Y6, Y5, Y6
+ VMOVDQU (R8), Y7
+ VMOVDQU (AX), Y8
+ VMOVDQU 32(R8), Y9
+ VMOVDQU 32(AX), Y10
+ VPXOR Y7, Y8, Y8
+ VPXOR Y9, Y10, Y10
+
+ // LEO_MULADD_256
+ VPAND Y8, Y2, Y11
+ VPSRLQ $0x04, Y8, Y12
+ VPSHUFB Y11, Y0, Y11
+ VPAND Y12, Y2, Y12
+ VPSHUFB Y12, Y1, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y7)
+
+ // LEO_MULADD_256
+ VPAND Y10, Y2, Y11
+ VPSRLQ $0x04, Y10, Y12
+ VPSHUFB Y11, Y0, Y11
+ VPAND Y12, Y2, Y12
+ VPSHUFB Y12, Y1, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y9)
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPXOR Y5, Y9, Y9
+ VPXOR Y6, Y10, Y10
+ VMOVDQU Y3, (SI)
+ VMOVDQU Y5, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y4, (DI)
+ VMOVDQU Y6, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y7, (R8)
+ VMOVDQU Y9, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y8, (AX)
+ VMOVDQU Y10, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT48_avx2_5(SB), NOSPLIT, $0-56
+ MOVQ t01+32(FP), AX
+ VBROADCASTI128 (AX), Y0
+ VBROADCASTI128 16(AX), Y1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X2
+ VPBROADCASTB X2, Y2
+
+loop:
+ VMOVDQU (SI), Y3
+ VMOVDQU 32(SI), Y4
+ VMOVDQU (R8), Y7
+ VMOVDQU 32(R8), Y8
+ VMOVDQU (DI), Y5
+ VMOVDQU 32(DI), Y6
+ VMOVDQU (AX), Y9
+ VMOVDQU 32(AX), Y10
+ VPXOR Y3, Y7, Y7
+ VPXOR Y5, Y9, Y9
+ VPXOR Y4, Y8, Y8
+ VPXOR Y6, Y10, Y10
+
+ // LEO_MULADD_256
+ VPAND Y5, Y2, Y11
+ VPSRLQ $0x04, Y5, Y12
+ VPSHUFB Y11, Y0, Y11
+ VPAND Y12, Y2, Y12
+ VPSHUFB Y12, Y1, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+
+ // LEO_MULADD_256
+ VPAND Y6, Y2, Y11
+ VPSRLQ $0x04, Y6, Y12
+ VPSHUFB Y11, Y0, Y11
+ VPAND Y12, Y2, Y12
+ VPSHUFB Y12, Y1, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+ VPXOR Y5, Y3, Y5
+ VPXOR Y6, Y4, Y6
+ VPXOR Y7, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VMOVDQU Y3, (SI)
+ VMOVDQU Y4, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y5, (DI)
+ VMOVDQU Y6, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y7, (R8)
+ VMOVDQU Y8, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y9, (AX)
+ VMOVDQU Y10, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·ifftDIT48_avx2_6(SB), NOSPLIT, $0-56
+ MOVQ t01+32(FP), AX
+ VBROADCASTI128 (AX), Y0
+ VBROADCASTI128 16(AX), Y1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X2
+ VPBROADCASTB X2, Y2
+
+loop:
+ VMOVDQU (SI), Y3
+ VMOVDQU (DI), Y4
+ VMOVDQU 32(SI), Y5
+ VMOVDQU 32(DI), Y6
+ VPXOR Y4, Y3, Y4
+ VPXOR Y6, Y5, Y6
+
+ // LEO_MULADD_256
+ VPAND Y4, Y2, Y7
+ VPSRLQ $0x04, Y4, Y8
+ VPSHUFB Y7, Y0, Y7
+ VPAND Y8, Y2, Y8
+ VPSHUFB Y8, Y1, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y3)
+
+ // LEO_MULADD_256
+ VPAND Y6, Y2, Y7
+ VPSRLQ $0x04, Y6, Y8
+ VPSHUFB Y7, Y0, Y7
+ VPAND Y8, Y2, Y8
+ VPSHUFB Y8, Y1, Y8
+ XOR3WAY( $0x00, Y7, Y8, Y5)
+ VMOVDQU (R8), Y7
+ VMOVDQU (AX), Y8
+ VMOVDQU 32(R8), Y9
+ VMOVDQU 32(AX), Y10
+ VPXOR Y7, Y8, Y8
+ VPXOR Y9, Y10, Y10
+ VPXOR Y3, Y7, Y7
+ VPXOR Y4, Y8, Y8
+ VPXOR Y5, Y9, Y9
+ VPXOR Y6, Y10, Y10
+ VMOVDQU Y3, (SI)
+ VMOVDQU Y5, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y4, (DI)
+ VMOVDQU Y6, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y7, (R8)
+ VMOVDQU Y9, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y8, (AX)
+ VMOVDQU Y10, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
+TEXT ·fftDIT48_avx2_6(SB), NOSPLIT, $0-56
+ MOVQ t02+48(FP), AX
+ VBROADCASTI128 (AX), Y0
+ VBROADCASTI128 16(AX), Y1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X2
+ VPBROADCASTB X2, Y2
+
+loop:
+ VMOVDQU (SI), Y3
+ VMOVDQU 32(SI), Y4
+ VMOVDQU (R8), Y7
+ VMOVDQU 32(R8), Y8
+ VMOVDQU (DI), Y5
+ VMOVDQU 32(DI), Y6
+ VMOVDQU (AX), Y9
+ VMOVDQU 32(AX), Y10
+
+ // LEO_MULADD_256
+ VPAND Y7, Y2, Y11
+ VPSRLQ $0x04, Y7, Y12
+ VPSHUFB Y11, Y0, Y11
+ VPAND Y12, Y2, Y12
+ VPSHUFB Y12, Y1, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y3)
+
+ // LEO_MULADD_256
+ VPAND Y8, Y2, Y11
+ VPSRLQ $0x04, Y8, Y12
+ VPSHUFB Y11, Y0, Y11
+ VPAND Y12, Y2, Y12
+ VPSHUFB Y12, Y1, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y4)
+
+ // LEO_MULADD_256
+ VPAND Y9, Y2, Y11
+ VPSRLQ $0x04, Y9, Y12
+ VPSHUFB Y11, Y0, Y11
+ VPAND Y12, Y2, Y12
+ VPSHUFB Y12, Y1, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y5)
+
+ // LEO_MULADD_256
+ VPAND Y10, Y2, Y11
+ VPSRLQ $0x04, Y10, Y12
+ VPSHUFB Y11, Y0, Y11
+ VPAND Y12, Y2, Y12
+ VPSHUFB Y12, Y1, Y12
+ XOR3WAY( $0x00, Y11, Y12, Y6)
+ VPXOR Y3, Y7, Y7
+ VPXOR Y5, Y9, Y9
+ VPXOR Y4, Y8, Y8
+ VPXOR Y6, Y10, Y10
+ VPXOR Y5, Y3, Y5
+ VPXOR Y6, Y4, Y6
+ VPXOR Y7, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VMOVDQU Y3, (SI)
+ VMOVDQU Y4, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y5, (DI)
+ VMOVDQU Y6, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y7, (R8)
+ VMOVDQU Y8, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y9, (AX)
+ VMOVDQU Y10, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+// Requires: AVX, AVX2, SSE2
+TEXT ·ifftDIT48_avx2_7(SB), NOSPLIT, $0-56
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X0
+ VPBROADCASTB X0, Y0
+
+loop:
+ VMOVDQU (SI), Y0
+ VMOVDQU (DI), Y1
+ VMOVDQU 32(SI), Y2
+ VMOVDQU 32(DI), Y3
+ VPXOR Y1, Y0, Y1
+ VPXOR Y3, Y2, Y3
+ VMOVDQU (R8), Y4
+ VMOVDQU (AX), Y5
+ VMOVDQU 32(R8), Y6
+ VMOVDQU 32(AX), Y7
+ VPXOR Y4, Y5, Y5
+ VPXOR Y6, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y1, Y5, Y5
+ VPXOR Y2, Y6, Y6
+ VPXOR Y3, Y7, Y7
+ VMOVDQU Y0, (SI)
+ VMOVDQU Y2, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y1, (DI)
+ VMOVDQU Y3, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y4, (R8)
+ VMOVDQU Y6, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y5, (AX)
+ VMOVDQU Y7, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
+// Requires: AVX, AVX2, SSE2
+TEXT ·fftDIT48_avx2_7(SB), NOSPLIT, $0-56
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+ MOVQ $0x0000000f, CX
+ MOVQ CX, X0
+ VPBROADCASTB X0, Y0
+
+loop:
+ VMOVDQU (SI), Y0
+ VMOVDQU 32(SI), Y1
+ VMOVDQU (R8), Y4
+ VMOVDQU 32(R8), Y5
+ VMOVDQU (DI), Y2
+ VMOVDQU 32(DI), Y3
+ VMOVDQU (AX), Y6
+ VMOVDQU 32(AX), Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y2, Y6, Y6
+ VPXOR Y1, Y5, Y5
+ VPXOR Y3, Y7, Y7
+ VPXOR Y2, Y0, Y2
+ VPXOR Y3, Y1, Y3
+ VPXOR Y4, Y6, Y6
+ VPXOR Y5, Y7, Y7
+ VMOVDQU Y0, (SI)
+ VMOVDQU Y1, 32(SI)
+ ADDQ $0x40, SI
+ VMOVDQU Y2, (DI)
+ VMOVDQU Y3, 32(DI)
+ ADDQ $0x40, DI
+ VMOVDQU Y4, (R8)
+ VMOVDQU Y5, 32(R8)
+ ADDQ $0x40, R8
+ VMOVDQU Y6, (AX)
+ VMOVDQU Y7, 32(AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
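The _gfni_ variants that follow keep the same dataflow but take each multiplier as a 64-bit value: VBROADCASTF32X2 replicates it across a ZMM register and VGF2P8AFFINEQB applies it as an 8x8 bit matrix to every source byte, replacing the nibble-lookup sequence with a single instruction; the product is folded in with VXORPD, or with VPTERNLOGD $0x96, which is the three-way XOR a^b^c. A scalar stand-in for that multiply-add is sketched below; the reducing polynomial is left as a parameter because it is not spelled out in this file (any GF(2^8) modulus with bit 8 set, e.g. 0x11d, fits the shape):

	// Scalar stand-in for the LEO_MULADD_512 blocks: XOR (c * src[i])
	// into dst[i], where multiplication by the constant c is what the
	// broadcast bit matrix encodes. Carry-less "peasant" multiplication
	// keeps the sketch self-contained; poly is an assumed parameter.
	func gfMulAdd(dst, src []byte, c byte, poly uint16) {
		for i := range src {
			a, b, prod := uint16(c), uint16(src[i]), uint16(0)
			for b != 0 {
				if b&1 != 0 {
					prod ^= a
				}
				b >>= 1
				a <<= 1
				if a&0x100 != 0 {
					a ^= poly // reduce modulo the field polynomial
				}
			}
			dst[i] ^= byte(prod)
		}
	}
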
+// func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·ifftDIT48_gfni_0(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t01+32(FP), Z0
+ VBROADCASTF32X2 t23+40(FP), Z1
+ VBROADCASTF32X2 t02+48(FP), Z2
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z3
+ VMOVDQU64 (DI), Z4
+ VMOVDQU64 (R8), Z5
+ VMOVDQU64 (AX), Z6
+ VXORPD Z4, Z3, Z4
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z7
+ VXORPD Z3, Z7, Z3
+ VXORPD Z5, Z6, Z6
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
+ VPTERNLOGD $0x96, Z7, Z3, Z5
+ VXORPD Z4, Z6, Z6
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z2, Z5, Z7
+ VXORPD Z3, Z7, Z3
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
+ VXORPD Z4, Z7, Z4
+ VMOVDQU64 Z3, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z4, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z5, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z6, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·fftDIT48_gfni_0(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t01+32(FP), Z0
+ VBROADCASTF32X2 t23+40(FP), Z1
+ VBROADCASTF32X2 t02+48(FP), Z2
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z3
+ VMOVDQU64 (DI), Z4
+ VMOVDQU64 (R8), Z5
+ VMOVDQU64 (AX), Z6
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z2, Z5, Z7
+ VXORPD Z3, Z7, Z3
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
+ VXORPD Z4, Z7, Z4
+ VXORPD Z3, Z5, Z5
+ VXORPD Z4, Z6, Z6
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z7
+ VXORPD Z3, Z7, Z3
+ VXORPD Z4, Z3, Z4
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
+ VXORPD Z5, Z7, Z5
+ VXORPD Z5, Z6, Z6
+ VMOVDQU64 Z3, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z4, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z5, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z6, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·ifftDIT48_gfni_1(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t23+40(FP), Z0
+ VBROADCASTF32X2 t02+48(FP), Z1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z2
+ VMOVDQU64 (DI), Z3
+ VMOVDQU64 (R8), Z4
+ VMOVDQU64 (AX), Z5
+ VXORPD Z3, Z2, Z3
+ VXORPD Z4, Z5, Z5
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z5, Z6
+ VPTERNLOGD $0x96, Z6, Z2, Z4
+ VXORPD Z3, Z5, Z5
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z4, Z6
+ VXORPD Z2, Z6, Z2
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
+ VXORPD Z3, Z6, Z3
+ VMOVDQU64 Z2, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z3, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z4, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z5, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·fftDIT48_gfni_1(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t01+32(FP), Z0
+ VBROADCASTF32X2 t23+40(FP), Z1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z2
+ VMOVDQU64 (DI), Z3
+ VMOVDQU64 (R8), Z4
+ VMOVDQU64 (AX), Z5
+ VXORPD Z2, Z4, Z4
+ VXORPD Z3, Z5, Z5
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z6
+ VXORPD Z2, Z6, Z2
+ VXORPD Z3, Z2, Z3
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
+ VXORPD Z4, Z6, Z4
+ VXORPD Z4, Z5, Z5
+ VMOVDQU64 Z2, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z3, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z4, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z5, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·ifftDIT48_gfni_2(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t01+32(FP), Z0
+ VBROADCASTF32X2 t02+48(FP), Z1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z2
+ VMOVDQU64 (DI), Z3
+ VMOVDQU64 (R8), Z4
+ VMOVDQU64 (AX), Z5
+ VXORPD Z3, Z2, Z3
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z6
+ VXORPD Z2, Z6, Z2
+ VXORPD Z4, Z5, Z5
+ VXORPD Z2, Z4, Z4
+ VXORPD Z3, Z5, Z5
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z4, Z6
+ VXORPD Z2, Z6, Z2
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
+ VXORPD Z3, Z6, Z3
+ VMOVDQU64 Z2, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z3, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z4, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z5, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·fftDIT48_gfni_2(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t23+40(FP), Z0
+ VBROADCASTF32X2 t02+48(FP), Z1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z2
+ VMOVDQU64 (DI), Z3
+ VMOVDQU64 (R8), Z4
+ VMOVDQU64 (AX), Z5
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z4, Z6
+ VXORPD Z2, Z6, Z2
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
+ VXORPD Z3, Z6, Z3
+ VXORPD Z2, Z4, Z4
+ VXORPD Z3, Z5, Z5
+ VXORPD Z3, Z2, Z3
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z5, Z6
+ VXORPD Z4, Z6, Z4
+ VXORPD Z4, Z5, Z5
+ VMOVDQU64 Z2, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z3, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z4, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z5, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·ifftDIT48_gfni_3(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t02+48(FP), Z0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z1
+ VMOVDQU64 (DI), Z2
+ VMOVDQU64 (R8), Z3
+ VMOVDQU64 (AX), Z4
+ VXORPD Z2, Z1, Z2
+ VXORPD Z3, Z4, Z4
+ VXORPD Z1, Z3, Z3
+ VXORPD Z2, Z4, Z4
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z5
+ VXORPD Z1, Z5, Z1
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
+ VXORPD Z2, Z5, Z2
+ VMOVDQU64 Z1, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z2, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z3, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z4, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·fftDIT48_gfni_3(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t23+40(FP), Z0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z1
+ VMOVDQU64 (DI), Z2
+ VMOVDQU64 (R8), Z3
+ VMOVDQU64 (AX), Z4
+ VXORPD Z1, Z3, Z3
+ VXORPD Z2, Z4, Z4
+ VXORPD Z2, Z1, Z2
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
+ VXORPD Z3, Z5, Z3
+ VXORPD Z3, Z4, Z4
+ VMOVDQU64 Z1, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z2, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z3, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z4, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·ifftDIT48_gfni_4(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t01+32(FP), Z0
+ VBROADCASTF32X2 t23+40(FP), Z1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z2
+ VMOVDQU64 (DI), Z3
+ VMOVDQU64 (R8), Z4
+ VMOVDQU64 (AX), Z5
+ VXORPD Z3, Z2, Z3
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z6
+ VXORPD Z2, Z6, Z2
+ VXORPD Z4, Z5, Z5
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
+ VPTERNLOGD $0x96, Z6, Z2, Z4
+ VXORPD Z3, Z5, Z5
+ VMOVDQU64 Z2, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z3, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z4, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z5, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·fftDIT48_gfni_4(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t01+32(FP), Z0
+ VBROADCASTF32X2 t02+48(FP), Z1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z2
+ VMOVDQU64 (DI), Z3
+ VMOVDQU64 (R8), Z4
+ VMOVDQU64 (AX), Z5
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z4, Z6
+ VXORPD Z2, Z6, Z2
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
+ VXORPD Z3, Z6, Z3
+ VXORPD Z2, Z4, Z4
+ VXORPD Z3, Z5, Z5
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z6
+ VXORPD Z2, Z6, Z2
+ VXORPD Z3, Z2, Z3
+ VXORPD Z4, Z5, Z5
+ VMOVDQU64 Z2, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z3, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z4, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z5, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·ifftDIT48_gfni_5(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t23+40(FP), Z0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z1
+ VMOVDQU64 (DI), Z2
+ VMOVDQU64 (R8), Z3
+ VMOVDQU64 (AX), Z4
+ VXORPD Z2, Z1, Z2
+ VXORPD Z3, Z4, Z4
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
+ VPTERNLOGD $0x96, Z5, Z1, Z3
+ VXORPD Z2, Z4, Z4
+ VMOVDQU64 Z1, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z2, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z3, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z4, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·fftDIT48_gfni_5(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t01+32(FP), Z0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z1
+ VMOVDQU64 (DI), Z2
+ VMOVDQU64 (R8), Z3
+ VMOVDQU64 (AX), Z4
+ VXORPD Z1, Z3, Z3
+ VXORPD Z2, Z4, Z4
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z2, Z5
+ VXORPD Z1, Z5, Z1
+ VXORPD Z2, Z1, Z2
+ VXORPD Z3, Z4, Z4
+ VMOVDQU64 Z1, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z2, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z3, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z4, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·ifftDIT48_gfni_6(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t01+32(FP), Z0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z1
+ VMOVDQU64 (DI), Z2
+ VMOVDQU64 (R8), Z3
+ VMOVDQU64 (AX), Z4
+ VXORPD Z2, Z1, Z2
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z2, Z5
+ VXORPD Z1, Z5, Z1
+ VXORPD Z3, Z4, Z4
+ VXORPD Z1, Z3, Z3
+ VXORPD Z2, Z4, Z4
+ VMOVDQU64 Z1, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z2, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z3, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z4, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·fftDIT48_gfni_6(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t02+48(FP), Z0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z1
+ VMOVDQU64 (DI), Z2
+ VMOVDQU64 (R8), Z3
+ VMOVDQU64 (AX), Z4
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z5
+ VXORPD Z1, Z5, Z1
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
+ VXORPD Z2, Z5, Z2
+ VXORPD Z1, Z3, Z3
+ VXORPD Z2, Z4, Z4
+ VXORPD Z2, Z1, Z2
+ VXORPD Z3, Z4, Z4
+ VMOVDQU64 Z1, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z2, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z3, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z4, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F
+TEXT ·ifftDIT48_gfni_7(SB), NOSPLIT, $0-56
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z0
+ VMOVDQU64 (DI), Z1
+ VMOVDQU64 (R8), Z2
+ VMOVDQU64 (AX), Z3
+ VXORPD Z1, Z0, Z1
+ VXORPD Z2, Z3, Z3
+ VXORPD Z0, Z2, Z2
+ VXORPD Z1, Z3, Z3
+ VMOVDQU64 Z0, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z1, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z2, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z3, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F
+TEXT ·fftDIT48_gfni_7(SB), NOSPLIT, $0-56
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z0
+ VMOVDQU64 (DI), Z1
+ VMOVDQU64 (R8), Z2
+ VMOVDQU64 (AX), Z3
+ VXORPD Z0, Z2, Z2
+ VXORPD Z1, Z3, Z3
+ VXORPD Z1, Z0, Z1
+ VXORPD Z2, Z3, Z3
+ VMOVDQU64 Z0, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z1, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z2, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z3, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_none.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_none.go
new file mode 100644
index 000000000..1bb268a3b
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_none.go
@@ -0,0 +1,33 @@
+//go:build !amd64 || noasm || appengine || gccgo || nogen
+
+package reedsolomon
+
+const maxAvx2Inputs = 1
+const maxAvx2Outputs = 1
+const minAvx2Size = 1
+const avxSizeMask = 0
+const avx2CodeGen = false
+
+func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
+ panic("codegen not available")
+}
+
+func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int {
+ panic("codegen not available")
+}
+
+func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
+ panic("codegen not available")
+}
+
+func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int {
+ panic("codegen not available")
+}
+
+func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
+ panic("codegen not available")
+}
+
+func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int {
+ panic("codegen not available")
+}
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.go
new file mode 100644
index 000000000..298bf5040
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.go
@@ -0,0 +1,2264 @@
+// Code generated by command: go run gen.go -out ../galois_gen_nopshufb_amd64.s -stubs ../galois_gen_nopshufb_amd64.go -pkg=reedsolomon. DO NOT EDIT.
+
+//go:build !appengine && !noasm && !nogen && nopshufb && gc
+
+package reedsolomon
+
+func _dummy_()
+
+//go:noescape
+func sSE2XorSlice(in []byte, out []byte)
+
+//go:noescape
+func sSE2XorSlice_64(in []byte, out []byte)
+
+//go:noescape
+func avx2XorSlice_64(in []byte, out []byte)
+
+// mulGFNI_1x1_64 takes 1 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x1 takes 1 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x1_64Xor takes 1 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x1Xor takes 1 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x2_64 takes 1 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x2 takes 1 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x2_64Xor takes 1 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x2Xor takes 1 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x3_64 takes 1 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x3 takes 1 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x3_64Xor takes 1 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x3Xor takes 1 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x4_64 takes 1 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x4 takes 1 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x4_64Xor takes 1 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x4Xor takes 1 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x5_64 takes 1 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x5 takes 1 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x5_64Xor takes 1 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x5Xor takes 1 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x6_64 takes 1 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x6 takes 1 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x6_64Xor takes 1 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x6Xor takes 1 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x7_64 takes 1 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x7 takes 1 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x7_64Xor takes 1 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x7Xor takes 1 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x8_64 takes 1 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x8 takes 1 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x8_64Xor takes 1 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x8Xor takes 1 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x9_64 takes 1 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x9 takes 1 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x9_64Xor takes 1 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x9Xor takes 1 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x10_64 takes 1 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x10 takes 1 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_1x10_64Xor takes 1 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_1x10Xor takes 1 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x1_64 takes 2 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x1 takes 2 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x1_64Xor takes 2 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x1Xor takes 2 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x2_64 takes 2 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x2 takes 2 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x2_64Xor takes 2 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x2Xor takes 2 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x3_64 takes 2 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x3 takes 2 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x3_64Xor takes 2 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x3Xor takes 2 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x4_64 takes 2 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x4 takes 2 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x4_64Xor takes 2 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x4Xor takes 2 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x5_64 takes 2 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x5 takes 2 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x5_64Xor takes 2 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x5Xor takes 2 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x6_64 takes 2 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x6 takes 2 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x6_64Xor takes 2 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x6Xor takes 2 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x7_64 takes 2 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x7 takes 2 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x7_64Xor takes 2 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x7Xor takes 2 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x8_64 takes 2 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x8 takes 2 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x8_64Xor takes 2 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x8Xor takes 2 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x9_64 takes 2 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x9 takes 2 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x9_64Xor takes 2 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x9Xor takes 2 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x10_64 takes 2 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x10 takes 2 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_2x10_64Xor takes 2 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_2x10Xor takes 2 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x1_64 takes 3 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x1 takes 3 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x1_64Xor takes 3 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x1Xor takes 3 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x2_64 takes 3 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x2 takes 3 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x2_64Xor takes 3 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x2Xor takes 3 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x3_64 takes 3 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x3 takes 3 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x3_64Xor takes 3 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x3Xor takes 3 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x4_64 takes 3 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x4 takes 3 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x4_64Xor takes 3 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x4Xor takes 3 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x5_64 takes 3 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x5 takes 3 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x5_64Xor takes 3 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x5Xor takes 3 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x6_64 takes 3 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x6 takes 3 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x6_64Xor takes 3 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x6Xor takes 3 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x7_64 takes 3 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x7 takes 3 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x7_64Xor takes 3 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x7Xor takes 3 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x8_64 takes 3 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x8 takes 3 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x8_64Xor takes 3 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x8Xor takes 3 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x9_64 takes 3 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x9 takes 3 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x9_64Xor takes 3 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x9Xor takes 3 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x10_64 takes 3 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x10 takes 3 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_3x10_64Xor takes 3 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_3x10Xor takes 3 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x1_64 takes 4 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x1 takes 4 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x1_64Xor takes 4 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x1Xor takes 4 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x2_64 takes 4 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x2 takes 4 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x2_64Xor takes 4 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x2Xor takes 4 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x3_64 takes 4 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x3 takes 4 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x3_64Xor takes 4 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x3Xor takes 4 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x4_64 takes 4 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x4 takes 4 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x4_64Xor takes 4 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x4Xor takes 4 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x5_64 takes 4 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x5 takes 4 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x5_64Xor takes 4 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x5Xor takes 4 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x6_64 takes 4 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x6 takes 4 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x6_64Xor takes 4 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x6Xor takes 4 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x7_64 takes 4 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x7 takes 4 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x7_64Xor takes 4 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x7Xor takes 4 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x8_64 takes 4 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x8 takes 4 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x8_64Xor takes 4 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x8Xor takes 4 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x9_64 takes 4 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x9 takes 4 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x9_64Xor takes 4 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x9Xor takes 4 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x10_64 takes 4 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x10 takes 4 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_4x10_64Xor takes 4 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_4x10Xor takes 4 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x1_64 takes 5 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x1 takes 5 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x1_64Xor takes 5 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x1Xor takes 5 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x2_64 takes 5 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x2 takes 5 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x2_64Xor takes 5 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x2Xor takes 5 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x3_64 takes 5 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x3 takes 5 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x3_64Xor takes 5 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x3Xor takes 5 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x4_64 takes 5 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x4 takes 5 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x4_64Xor takes 5 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x4Xor takes 5 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x5_64 takes 5 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x5 takes 5 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x5_64Xor takes 5 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x5Xor takes 5 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x6_64 takes 5 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x6 takes 5 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x6_64Xor takes 5 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x6Xor takes 5 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x7_64 takes 5 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x7 takes 5 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x7_64Xor takes 5 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x7Xor takes 5 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x8_64 takes 5 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x8 takes 5 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x8_64Xor takes 5 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x8Xor takes 5 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x9_64 takes 5 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x9 takes 5 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x9_64Xor takes 5 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x9Xor takes 5 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x10_64 takes 5 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x10 takes 5 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_5x10_64Xor takes 5 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_5x10Xor takes 5 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x1_64 takes 6 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x1 takes 6 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x1_64Xor takes 6 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x1Xor takes 6 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x2_64 takes 6 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x2 takes 6 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x2_64Xor takes 6 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x2Xor takes 6 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x3_64 takes 6 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x3 takes 6 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x3_64Xor takes 6 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x3Xor takes 6 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x4_64 takes 6 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x4 takes 6 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x4_64Xor takes 6 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x4Xor takes 6 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x5_64 takes 6 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x5 takes 6 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x5_64Xor takes 6 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x5Xor takes 6 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x6_64 takes 6 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x6 takes 6 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x6_64Xor takes 6 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x6Xor takes 6 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x7_64 takes 6 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x7 takes 6 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x7_64Xor takes 6 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x7Xor takes 6 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x8_64 takes 6 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x8 takes 6 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x8_64Xor takes 6 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x8Xor takes 6 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x9_64 takes 6 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x9 takes 6 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x9_64Xor takes 6 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x9Xor takes 6 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x10_64 takes 6 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x10 takes 6 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_6x10_64Xor takes 6 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_6x10Xor takes 6 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x1_64 takes 7 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x1 takes 7 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x1_64Xor takes 7 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x1Xor takes 7 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x2_64 takes 7 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x2 takes 7 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x2_64Xor takes 7 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x2Xor takes 7 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x3_64 takes 7 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x3 takes 7 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x3_64Xor takes 7 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x3Xor takes 7 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x4_64 takes 7 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x4 takes 7 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x4_64Xor takes 7 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x4Xor takes 7 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x5_64 takes 7 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x5 takes 7 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x5_64Xor takes 7 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x5Xor takes 7 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x6_64 takes 7 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x6 takes 7 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x6_64Xor takes 7 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x6Xor takes 7 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x7_64 takes 7 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x7 takes 7 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x7_64Xor takes 7 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x7Xor takes 7 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x8_64 takes 7 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x8 takes 7 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x8_64Xor takes 7 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x8Xor takes 7 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x9_64 takes 7 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x9 takes 7 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x9_64Xor takes 7 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x9Xor takes 7 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x10_64 takes 7 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x10 takes 7 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_7x10_64Xor takes 7 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_7x10Xor takes 7 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x1_64 takes 8 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x1 takes 8 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x1_64Xor takes 8 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x1Xor takes 8 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x2_64 takes 8 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x2 takes 8 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x2_64Xor takes 8 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x2Xor takes 8 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x3_64 takes 8 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x3 takes 8 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x3_64Xor takes 8 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x3Xor takes 8 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x4_64 takes 8 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x4 takes 8 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x4_64Xor takes 8 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x4Xor takes 8 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x5_64 takes 8 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x5 takes 8 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x5_64Xor takes 8 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x5Xor takes 8 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x6_64 takes 8 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x6 takes 8 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x6_64Xor takes 8 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x6Xor takes 8 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x7_64 takes 8 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x7 takes 8 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x7_64Xor takes 8 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x7Xor takes 8 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x8_64 takes 8 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x8 takes 8 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x8_64Xor takes 8 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x8Xor takes 8 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x9_64 takes 8 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x9 takes 8 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x9_64Xor takes 8 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x9Xor takes 8 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x10_64 takes 8 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x10 takes 8 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_8x10_64Xor takes 8 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_8x10Xor takes 8 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x1_64 takes 9 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x1 takes 9 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x1_64Xor takes 9 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x1Xor takes 9 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x2_64 takes 9 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x2 takes 9 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x2_64Xor takes 9 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x2Xor takes 9 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x3_64 takes 9 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x3 takes 9 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x3_64Xor takes 9 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x3Xor takes 9 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x4_64 takes 9 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x4 takes 9 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x4_64Xor takes 9 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x4Xor takes 9 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x5_64 takes 9 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x5 takes 9 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x5_64Xor takes 9 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x5Xor takes 9 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x6_64 takes 9 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x6 takes 9 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x6_64Xor takes 9 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x6Xor takes 9 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x7_64 takes 9 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x7 takes 9 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x7_64Xor takes 9 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x7Xor takes 9 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x8_64 takes 9 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x8 takes 9 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x8_64Xor takes 9 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x8Xor takes 9 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x9_64 takes 9 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x9 takes 9 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x9_64Xor takes 9 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x9Xor takes 9 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x10_64 takes 9 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x10 takes 9 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_9x10_64Xor takes 9 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_9x10Xor takes 9 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x1_64 takes 10 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x1 takes 10 inputs and produces 1 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x1_64Xor takes 10 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x1Xor takes 10 inputs and produces 1 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x2_64 takes 10 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x2 takes 10 inputs and produces 2 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x2_64Xor takes 10 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x2Xor takes 10 inputs and produces 2 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x3_64 takes 10 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x3 takes 10 inputs and produces 3 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x3_64Xor takes 10 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x3Xor takes 10 inputs and produces 3 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x4_64 takes 10 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x4 takes 10 inputs and produces 4 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x4_64Xor takes 10 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x4Xor takes 10 inputs and produces 4 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x5_64 takes 10 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x5 takes 10 inputs and produces 5 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x5_64Xor takes 10 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x5Xor takes 10 inputs and produces 5 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x6_64 takes 10 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x6 takes 10 inputs and produces 6 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x6_64Xor takes 10 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x6Xor takes 10 inputs and produces 6 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x7_64 takes 10 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x7 takes 10 inputs and produces 7 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x7_64Xor takes 10 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x7Xor takes 10 inputs and produces 7 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x8_64 takes 10 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x8 takes 10 inputs and produces 8 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x8_64Xor takes 10 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x8Xor takes 10 inputs and produces 8 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x9_64 takes 10 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x9 takes 10 inputs and produces 9 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x9_64Xor takes 10 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x9Xor takes 10 inputs and produces 9 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x10_64 takes 10 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x10 takes 10 inputs and produces 10 outputs.
+// The output is initialized to 0.
+//
+//go:noescape
+func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulGFNI_10x10_64Xor takes 10 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+// mulAvxGFNI_10x10Xor takes 10 inputs and produces 10 outputs.
+//
+//go:noescape
+func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+
+//go:noescape
+func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func fftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func ifftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func fftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func ifftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func fftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func ifftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func fftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func ifftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func fftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func ifftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func fftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func ifftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func fftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func ifftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+
+//go:noescape
+func fftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
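
Every stub declared above shares the same five-argument signature; the only behavioural difference between a plain kernel and its Xor counterpart is whether the selected byte range of each output shard is written from scratch ("The output is initialized to 0") or XORed into the existing contents. A pure-Go sketch of that contract follows. It is illustrative only and not part of the vendored file: the input-major layout of the coefficient slice and the 0x11d field polynomial are assumptions, and the real stubs receive each coefficient pre-packed as a 64-bit GFNI affine matrix rather than as a byte.

package reedsolomon_sketch

// gfMul multiplies two GF(2^8) elements, assuming the 0x11d polynomial
// the package's Galois tables are generated from.
func gfMul(a, b byte) byte {
	var p byte
	for i := 0; i < 8; i++ {
		if b&1 != 0 {
			p ^= a
		}
		carry := a & 0x80
		a <<= 1
		if carry != 0 {
			a ^= 0x1d
		}
		b >>= 1
	}
	return p
}

// mulSketch models the stub contract: one coefficient per (input, output)
// pair (input-major layout assumed here), start/n select a byte range of
// every shard, and xor selects between the plain and the *Xor behaviour.
func mulSketch(coeffs []byte, in, out [][]byte, start, n int, xor bool) {
	for o := range out {
		for b := start; b < start+n; b++ {
			var acc byte
			for i := range in {
				acc ^= gfMul(coeffs[i*len(out)+o], in[i][b])
			}
			if xor {
				out[o][b] ^= acc // *Xor variants: accumulate into the output
			} else {
				out[o][b] = acc // plain variants: output written from scratch
			}
		}
	}
}
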
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.s b/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.s
new file mode 100644
index 000000000..5782759c6
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.s
@@ -0,0 +1,67987 @@
+// Code generated by command: go run gen.go -out ../galois_gen_nopshufb_amd64.s -stubs ../galois_gen_nopshufb_amd64.go -pkg=reedsolomon. DO NOT EDIT.
+
+//go:build !appengine && !noasm && !nogen && nopshufb && gc
+
+#include "textflag.h"
+
+// func _dummy_()
+TEXT ·_dummy_(SB), $0
+#ifdef GOAMD64_v4
+#define XOR3WAY(ignore, a, b, dst) \
+ VPTERNLOGD $0x96, a, b, dst
+
+#else
+#define XOR3WAY(ignore, a, b, dst) \
+ VPXOR a, dst, dst \
+ VPXOR b, dst, dst
+
+#endif
+ RET
+
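
The _dummy_ body above only exists to host the XOR3WAY macro: on GOAMD64_v4 builds a single VPTERNLOGD replaces the two VPXORs, and the immediate 0x96 is exactly the three-input truth table of XOR. A few lines of Go (for reference only, not part of the generated file) confirm the constant:

package main

import "fmt"

func main() {
	// Bit (a<<2 | b<<1 | c) of the VPTERNLOGD immediate holds the output
	// for that combination of the three inputs; for a^b^c this is 0x96.
	var imm byte
	for a := 0; a < 2; a++ {
		for b := 0; b < 2; b++ {
			for c := 0; c < 2; c++ {
				if a^b^c == 1 {
					imm |= 1 << uint(a<<2|b<<1|c)
				}
			}
		}
	}
	fmt.Printf("%#x\n", imm) // prints 0x96
}
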
+// sSE2XorSlice will XOR in with out and store in out.
+// Processes 16 bytes/loop.
+
+// func sSE2XorSlice(in []byte, out []byte)
+// Requires: SSE2
+TEXT ·sSE2XorSlice(SB), $0-48
+ MOVQ in_base+0(FP), AX
+ MOVQ out_base+24(FP), CX
+ MOVQ in_len+8(FP), DX
+ SHRQ $0x04, DX
+ JZ end
+
+loop:
+ MOVOU (AX), X0
+ MOVOU (CX), X1
+ PXOR X0, X1
+ MOVOU X1, (CX)
+ ADDQ $0x10, AX
+ ADDQ $0x10, CX
+ DECQ DX
+ JNZ loop
+
+end:
+ RET
+
+// sSE2XorSlice_64 will XOR in with out and store in out.
+// Processes 64 bytes/loop.
+
+// func sSE2XorSlice_64(in []byte, out []byte)
+// Requires: SSE2
+TEXT ·sSE2XorSlice_64(SB), $0-48
+ MOVQ in_base+0(FP), AX
+ MOVQ out_base+24(FP), CX
+ MOVQ in_len+8(FP), DX
+ SHRQ $0x06, DX
+ JZ end
+
+loop:
+ MOVOU (AX), X0
+ MOVOU 16(AX), X2
+ MOVOU 32(AX), X4
+ MOVOU 48(AX), X6
+ MOVOU (CX), X1
+ MOVOU 16(CX), X3
+ MOVOU 32(CX), X5
+ MOVOU 48(CX), X7
+ PXOR X0, X1
+ PXOR X2, X3
+ PXOR X4, X5
+ PXOR X6, X7
+ MOVOU X1, (CX)
+ MOVOU X3, 16(CX)
+ MOVOU X5, 32(CX)
+ MOVOU X7, 48(CX)
+ ADDQ $0x40, AX
+ ADDQ $0x40, CX
+ DECQ DX
+ JNZ loop
+
+end:
+ RET
+
+// avx2XorSlice_64 will XOR in with out and store in out.
+// Processes 64 bytes/loop.
+
+// func avx2XorSlice_64(in []byte, out []byte)
+// Requires: AVX, AVX2
+TEXT ·avx2XorSlice_64(SB), $0-48
+ MOVQ in_base+0(FP), AX
+ MOVQ out_base+24(FP), CX
+ MOVQ in_len+8(FP), DX
+ SHRQ $0x06, DX
+ JZ end
+
+loop:
+ VMOVDQU (AX), Y0
+ VMOVDQU 32(AX), Y2
+ VMOVDQU (CX), Y1
+ VMOVDQU 32(CX), Y3
+ VPXOR Y0, Y1, Y1
+ VPXOR Y2, Y3, Y3
+ VMOVDQU Y1, (CX)
+ VMOVDQU Y3, 32(CX)
+ ADDQ $0x40, AX
+ ADDQ $0x40, CX
+ DECQ DX
+ JNZ loop
+
+end:
+ VZEROUPPER
+ RET
+
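
The XOR helpers above share one contract: XOR `in` into `out` in place, a fixed number of bytes per loop iteration, leaving any tail shorter than the block size untouched (the SHRQ simply drops the remainder before the loop). A pure-Go reference of that behaviour, illustrative only:

// xorSliceRef mirrors sSE2XorSlice (block = 16) and the _64 variants
// (block = 64): XOR in into out, whole blocks only, tail bytes ignored.
func xorSliceRef(in, out []byte, block int) {
	n := (len(in) / block) * block // as in the assembly, the remainder is dropped
	for i := 0; i < n; i++ {
		out[i] ^= in[i]
	}
}
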
+// func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 4 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), DX
+ MOVQ start+72(FP), BX
+
+ // Add start offset to output
+ ADDQ BX, DX
+
+ // Add start offset to input
+ ADDQ BX, CX
+
+mulGFNI_1x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (CX), Z1
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z1, Z1
+
+ // Store 1 outputs
+ VMOVDQU64 Z1, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x1_64_loop
+ VZEROUPPER
+
+mulGFNI_1x1_64_end:
+ RET
+
+// func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 4 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x1_end
+ VBROADCASTSD (CX), Y0
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), DX
+ MOVQ start+72(FP), BX
+
+ // Add start offset to output
+ ADDQ BX, DX
+
+ // Add start offset to input
+ ADDQ BX, CX
+
+mulAvxGFNI_1x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (CX), Y1
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y1, Y1
+
+ // Store 1 outputs
+ VMOVDQU Y1, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x1_end:
+ RET
+
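
The two kernels above compute the same 1x1 product: the AVX512DQ/GFNI version consumes 64 bytes per iteration (SHRQ $0x06 on n), the plain AVX/GFNI version 32 bytes (SHRQ $0x05). How the package actually picks a kernel is decided elsewhere; the sketch below only illustrates dispatching on the "Requires:" features via the cpuid/v2 dependency, and the mul1x1 wrapper name is hypothetical.

package reedsolomon

import "github.com/klauspost/cpuid/v2"

// mul1x1 is a hypothetical wrapper: pick the widest kernel the CPU
// supports, matching the "Requires:" comments on the routines above.
func mul1x1(matrix []uint64, in, out [][]byte, start, n int) {
	switch {
	case cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ, cpuid.GFNI):
		mulGFNI_1x1_64(matrix, in, out, start, n) // 64 bytes per iteration
	case cpuid.CPU.Supports(cpuid.AVX, cpuid.GFNI):
		mulAvxGFNI_1x1(matrix, in, out, start, n) // 32 bytes per iteration
	default:
		// a table-driven pure-Go fallback would handle this case
	}
}
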
+// func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 4 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), DX
+ MOVQ start+72(FP), BX
+
+ // Add start offset to output
+ ADDQ BX, DX
+
+ // Add start offset to input
+ ADDQ BX, CX
+
+mulGFNI_1x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (DX), Z1
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (CX), Z2
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z2, Z2
+ VXORPD Z1, Z2, Z1
+
+ // Store 1 outputs
+ VMOVDQU64 Z1, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 4 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x1Xor_end
+ VBROADCASTSD (CX), Y0
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), DX
+ MOVQ start+72(FP), BX
+
+ // Add start offset to output
+ ADDQ BX, DX
+
+ // Add start offset to input
+ ADDQ BX, CX
+
+mulAvxGFNI_1x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (DX), Y1
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (CX), Y2
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y2, Y2
+ VXORPD Y1, Y2, Y1
+
+ // Store 1 outputs
+ VMOVDQU Y1, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x1Xor_end:
+ RET
+
+// func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 6 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+ ADDQ SI, DX
+
+ // Add start offset to input
+ ADDQ SI, CX
+
+mulGFNI_1x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (CX), Z3
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z2
+ VGF2P8AFFINEQB $0x00, Z1, Z3, Z3
+
+ // Store 2 outputs
+ VMOVDQU64 Z2, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z3, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x2_64_loop
+ VZEROUPPER
+
+mulGFNI_1x2_64_end:
+ RET
+
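
From two outputs upward the routines index the out slice headers directly: out[0]'s data pointer is read at offset 0 and out[1]'s at offset 24, because a []byte header occupies 24 bytes (pointer, length, capacity) on 64-bit platforms. A quick check, for reference only:

package main

import (
	"fmt"
	"unsafe"
)

func main() {
	fmt.Println(unsafe.Sizeof([]byte(nil))) // 24 on amd64, hence MOVQ 24(DX), ...
}
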
+// func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x2(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 6 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+ ADDQ SI, DX
+
+ // Add start offset to input
+ ADDQ SI, CX
+
+mulAvxGFNI_1x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (CX), Y3
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y3, Y2
+ VGF2P8AFFINEQB $0x00, Y1, Y3, Y3
+
+ // Store 2 outputs
+ VMOVDQU Y2, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y3, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x2_end:
+ RET
+
+// func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 6 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+ ADDQ SI, DX
+
+ // Add start offset to input
+ ADDQ SI, CX
+
+mulGFNI_1x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (BX), Z2
+ VMOVDQU64 (DX), Z3
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (CX), Z4
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
+ VXORPD Z2, Z5, Z2
+ VGF2P8AFFINEQB $0x00, Z1, Z4, Z5
+ VXORPD Z3, Z5, Z3
+
+ // Store 2 outputs
+ VMOVDQU64 Z2, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z3, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x2Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 6 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+ ADDQ SI, DX
+
+ // Add start offset to input
+ ADDQ SI, CX
+
+mulAvxGFNI_1x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (BX), Y2
+ VMOVDQU (DX), Y3
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (CX), Y4
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y4, Y5
+ VXORPD Y2, Y5, Y2
+ VGF2P8AFFINEQB $0x00, Y1, Y4, Y5
+ VXORPD Y3, Y5, Y3
+
+ // Store 2 outputs
+ VMOVDQU Y2, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y3, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x2Xor_end:
+ RET
+
+// func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, BX
+ ADDQ DI, SI
+ ADDQ DI, DX
+
+ // Add start offset to input
+ ADDQ DI, CX
+
+mulGFNI_1x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (CX), Z5
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z5, Z3
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z4
+ VGF2P8AFFINEQB $0x00, Z2, Z5, Z5
+
+ // Store 3 outputs
+ VMOVDQU64 Z3, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z4, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z5, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x3_64_loop
+ VZEROUPPER
+
+mulGFNI_1x3_64_end:
+ RET
+
+// func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x3(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, BX
+ ADDQ DI, SI
+ ADDQ DI, DX
+
+ // Add start offset to input
+ ADDQ DI, CX
+
+mulAvxGFNI_1x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (CX), Y5
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y5, Y3
+ VGF2P8AFFINEQB $0x00, Y1, Y5, Y4
+ VGF2P8AFFINEQB $0x00, Y2, Y5, Y5
+
+ // Store 3 outputs
+ VMOVDQU Y3, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y4, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y5, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x3_end:
+ RET
+
+// func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x3_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, BX
+ ADDQ DI, SI
+ ADDQ DI, DX
+
+ // Add start offset to input
+ ADDQ DI, CX
+
+mulGFNI_1x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (BX), Z3
+ VMOVDQU64 (SI), Z4
+ VMOVDQU64 (DX), Z5
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (CX), Z6
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z6, Z7
+ VXORPD Z3, Z7, Z3
+ VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
+ VXORPD Z4, Z7, Z4
+ VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
+ VXORPD Z5, Z7, Z5
+
+ // Store 3 outputs
+ VMOVDQU64 Z3, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z4, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z5, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x3Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, BX
+ ADDQ DI, SI
+ ADDQ DI, DX
+
+ // Add start offset to input
+ ADDQ DI, CX
+
+mulAvxGFNI_1x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (BX), Y3
+ VMOVDQU (SI), Y4
+ VMOVDQU (DX), Y5
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (CX), Y6
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y6, Y7
+ VXORPD Y3, Y7, Y3
+ VGF2P8AFFINEQB $0x00, Y1, Y6, Y7
+ VXORPD Y4, Y7, Y4
+ VGF2P8AFFINEQB $0x00, Y2, Y6, Y7
+ VXORPD Y5, Y7, Y5
+
+ // Store 3 outputs
+ VMOVDQU Y3, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y4, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y5, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x3Xor_end:
+ RET
+
+// func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x4_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, DX
+
+ // Add start offset to input
+ ADDQ R8, CX
+
+mulGFNI_1x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (CX), Z7
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z7, Z4
+ VGF2P8AFFINEQB $0x00, Z1, Z7, Z5
+ VGF2P8AFFINEQB $0x00, Z2, Z7, Z6
+ VGF2P8AFFINEQB $0x00, Z3, Z7, Z7
+
+ // Store 4 outputs
+ VMOVDQU64 Z4, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z5, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z6, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z7, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x4_64_loop
+ VZEROUPPER
+
+mulGFNI_1x4_64_end:
+ RET
+
+// func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x4(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, DX
+
+ // Add start offset to input
+ ADDQ R8, CX
+
+mulAvxGFNI_1x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (CX), Y7
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y7, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y7, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y7, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y7, Y7
+
+ // Store 4 outputs
+ VMOVDQU Y4, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y5, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y7, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x4_end:
+ RET
+
+// func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x4_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, DX
+
+ // Add start offset to input
+ ADDQ R8, CX
+
+mulGFNI_1x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (BX), Z4
+ VMOVDQU64 (SI), Z5
+ VMOVDQU64 (DI), Z6
+ VMOVDQU64 (DX), Z7
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (CX), Z8
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z8, Z9
+ VXORPD Z4, Z9, Z4
+ VGF2P8AFFINEQB $0x00, Z1, Z8, Z9
+ VXORPD Z5, Z9, Z5
+ VGF2P8AFFINEQB $0x00, Z2, Z8, Z9
+ VXORPD Z6, Z9, Z6
+ VGF2P8AFFINEQB $0x00, Z3, Z8, Z9
+ VXORPD Z7, Z9, Z7
+
+ // Store 4 outputs
+ VMOVDQU64 Z4, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z5, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z6, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z7, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x4Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, DX
+
+ // Add start offset to input
+ ADDQ R8, CX
+
+mulAvxGFNI_1x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (BX), Y4
+ VMOVDQU (SI), Y5
+ VMOVDQU (DI), Y6
+ VMOVDQU (DX), Y7
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (CX), Y8
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y8, Y9
+ VXORPD Y4, Y9, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y8, Y9
+ VXORPD Y5, Y9, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y8, Y9
+ VXORPD Y6, Y9, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y8, Y9
+ VXORPD Y7, Y9, Y7
+
+ // Store 4 outputs
+ VMOVDQU Y4, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y5, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y7, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x4Xor_end:
+ RET
+
+// func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x5_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, DX
+
+ // Add start offset to input
+ ADDQ R9, CX
+
+mulGFNI_1x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (CX), Z9
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z9, Z5
+ VGF2P8AFFINEQB $0x00, Z1, Z9, Z6
+ VGF2P8AFFINEQB $0x00, Z2, Z9, Z7
+ VGF2P8AFFINEQB $0x00, Z3, Z9, Z8
+ VGF2P8AFFINEQB $0x00, Z4, Z9, Z9
+
+ // Store 5 outputs
+ VMOVDQU64 Z5, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z6, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z7, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z8, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z9, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x5_64_loop
+ VZEROUPPER
+
+mulGFNI_1x5_64_end:
+ RET
+
+// func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x5(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, DX
+
+ // Add start offset to input
+ ADDQ R9, CX
+
+mulAvxGFNI_1x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (CX), Y9
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y9, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y9, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y9, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y9, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y9, Y9
+
+ // Store 5 outputs
+ VMOVDQU Y5, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y6, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x5_end:
+ RET
+
+// func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x5_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, DX
+
+ // Add start offset to input
+ ADDQ R9, CX
+
+mulGFNI_1x5_64Xor_loop:
+ // Load 5 outputs
+ VMOVDQU64 (BX), Z5
+ VMOVDQU64 (SI), Z6
+ VMOVDQU64 (DI), Z7
+ VMOVDQU64 (R8), Z8
+ VMOVDQU64 (DX), Z9
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (CX), Z10
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z10, Z11
+ VXORPD Z5, Z11, Z5
+ VGF2P8AFFINEQB $0x00, Z1, Z10, Z11
+ VXORPD Z6, Z11, Z6
+ VGF2P8AFFINEQB $0x00, Z2, Z10, Z11
+ VXORPD Z7, Z11, Z7
+ VGF2P8AFFINEQB $0x00, Z3, Z10, Z11
+ VXORPD Z8, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z4, Z10, Z11
+ VXORPD Z9, Z11, Z9
+
+ // Store 5 outputs
+ VMOVDQU64 Z5, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z6, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z7, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z8, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z9, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x5Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, DX
+
+ // Add start offset to input
+ ADDQ R9, CX
+
+mulAvxGFNI_1x5Xor_loop:
+ // Load 5 outputs
+ VMOVDQU (BX), Y5
+ VMOVDQU (SI), Y6
+ VMOVDQU (DI), Y7
+ VMOVDQU (R8), Y8
+ VMOVDQU (DX), Y9
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (CX), Y10
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y10, Y11
+ VXORPD Y5, Y11, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y10, Y11
+ VXORPD Y6, Y11, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y10, Y11
+ VXORPD Y7, Y11, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y10, Y11
+ VXORPD Y8, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y10, Y11
+ VXORPD Y9, Y11, Y9
+
+ // Store 5 outputs
+ VMOVDQU Y5, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y6, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x5Xor_end:
+ RET
+
+// func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x6_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, DX
+
+ // Add start offset to input
+ ADDQ R10, CX
+
+mulGFNI_1x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (CX), Z11
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z11, Z6
+ VGF2P8AFFINEQB $0x00, Z1, Z11, Z7
+ VGF2P8AFFINEQB $0x00, Z2, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z3, Z11, Z9
+ VGF2P8AFFINEQB $0x00, Z4, Z11, Z10
+ VGF2P8AFFINEQB $0x00, Z5, Z11, Z11
+
+ // Store 6 outputs
+ VMOVDQU64 Z6, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z7, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z8, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z9, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z10, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z11, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x6_64_loop
+ VZEROUPPER
+
+mulGFNI_1x6_64_end:
+ RET
+
+// func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x6(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, DX
+
+ // Add start offset to input
+ ADDQ R10, CX
+
+mulAvxGFNI_1x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (CX), Y11
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y11, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y11, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y11, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y11, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y11, Y11
+
+ // Store 6 outputs
+ VMOVDQU Y6, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y7, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y8, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y9, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x6_end:
+ RET
+
+// func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x6_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, DX
+
+ // Add start offset to input
+ ADDQ R10, CX
+
+mulGFNI_1x6_64Xor_loop:
+ // Load 6 outputs
+ VMOVDQU64 (BX), Z6
+ VMOVDQU64 (SI), Z7
+ VMOVDQU64 (DI), Z8
+ VMOVDQU64 (R8), Z9
+ VMOVDQU64 (R9), Z10
+ VMOVDQU64 (DX), Z11
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (CX), Z12
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z12, Z13
+ VXORPD Z6, Z13, Z6
+ VGF2P8AFFINEQB $0x00, Z1, Z12, Z13
+ VXORPD Z7, Z13, Z7
+ VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
+ VXORPD Z8, Z13, Z8
+ VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Store 6 outputs
+ VMOVDQU64 Z6, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z7, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z8, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z9, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z10, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z11, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x6Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, DX
+
+ // Add start offset to input
+ ADDQ R10, CX
+
+mulAvxGFNI_1x6Xor_loop:
+ // Load 6 outputs
+ VMOVDQU (BX), Y6
+ VMOVDQU (SI), Y7
+ VMOVDQU (DI), Y8
+ VMOVDQU (R8), Y9
+ VMOVDQU (R9), Y10
+ VMOVDQU (DX), Y11
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (CX), Y12
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y12, Y13
+ VXORPD Y6, Y13, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y12, Y13
+ VXORPD Y7, Y13, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
+ VXORPD Y8, Y13, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Store 6 outputs
+ VMOVDQU Y6, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y7, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y8, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y9, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x6Xor_end:
+ RET
+
+// func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x7_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 16 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DX
+
+ // Add start offset to input
+ ADDQ R11, CX
+
+mulGFNI_1x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (CX), Z13
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z13, Z7
+ VGF2P8AFFINEQB $0x00, Z1, Z13, Z8
+ VGF2P8AFFINEQB $0x00, Z2, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z3, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z4, Z13, Z11
+ VGF2P8AFFINEQB $0x00, Z5, Z13, Z12
+ VGF2P8AFFINEQB $0x00, Z6, Z13, Z13
+
+ // Store 7 outputs
+ VMOVDQU64 Z7, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z8, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z9, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z10, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z11, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z12, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z13, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x7_64_loop
+ VZEROUPPER
+
+mulGFNI_1x7_64_end:
+ RET
+
+// func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x7(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 16 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DX
+
+ // Add start offset to input
+ ADDQ R11, CX
+
+mulAvxGFNI_1x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (CX), Y13
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y13, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y13, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y13, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y13, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y13, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y8, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x7_end:
+ RET
+
+// func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x7_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 16 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DX
+
+ // Add start offset to input
+ ADDQ R11, CX
+
+mulGFNI_1x7_64Xor_loop:
+ // Load 7 outputs
+ VMOVDQU64 (BX), Z7
+ VMOVDQU64 (SI), Z8
+ VMOVDQU64 (DI), Z9
+ VMOVDQU64 (R8), Z10
+ VMOVDQU64 (R9), Z11
+ VMOVDQU64 (R10), Z12
+ VMOVDQU64 (DX), Z13
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (CX), Z14
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z14, Z15
+ VXORPD Z7, Z15, Z7
+ VGF2P8AFFINEQB $0x00, Z1, Z14, Z15
+ VXORPD Z8, Z15, Z8
+ VGF2P8AFFINEQB $0x00, Z2, Z14, Z15
+ VXORPD Z9, Z15, Z9
+ VGF2P8AFFINEQB $0x00, Z3, Z14, Z15
+ VXORPD Z10, Z15, Z10
+ VGF2P8AFFINEQB $0x00, Z4, Z14, Z15
+ VXORPD Z11, Z15, Z11
+ VGF2P8AFFINEQB $0x00, Z5, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z6, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Store 7 outputs
+ VMOVDQU64 Z7, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z8, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z9, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z10, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z11, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z12, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z13, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x7Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 16 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DX
+
+ // Add start offset to input
+ ADDQ R11, CX
+
+mulAvxGFNI_1x7Xor_loop:
+ // Load 7 outputs
+ VMOVDQU (BX), Y7
+ VMOVDQU (SI), Y8
+ VMOVDQU (DI), Y9
+ VMOVDQU (R8), Y10
+ VMOVDQU (R9), Y11
+ VMOVDQU (R10), Y12
+ VMOVDQU (DX), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (CX), Y14
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (BX)
+ ADDQ $0x20, BX
+ VMOVDQU Y8, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (DX)
+ ADDQ $0x20, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x7Xor_end:
+ RET
+
+// func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x8_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, DX
+
+ // Add start offset to input
+ ADDQ R12, CX
+
+mulGFNI_1x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (CX), Z15
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z15, Z8
+ VGF2P8AFFINEQB $0x00, Z1, Z15, Z9
+ VGF2P8AFFINEQB $0x00, Z2, Z15, Z10
+ VGF2P8AFFINEQB $0x00, Z3, Z15, Z11
+ VGF2P8AFFINEQB $0x00, Z4, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z5, Z15, Z13
+ VGF2P8AFFINEQB $0x00, Z6, Z15, Z14
+ VGF2P8AFFINEQB $0x00, Z7, Z15, Z15
+
+ // Store 8 outputs
+ VMOVDQU64 Z8, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z9, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z10, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z11, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z12, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z13, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z14, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z15, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x8_64_loop
+ VZEROUPPER
+
+mulGFNI_1x8_64_end:
+ RET
+
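+	// Note: with 8 or more outputs the AVX variants run out of YMM registers for the
+	// coefficient matrices, so only part of the table stays resident (see "Loading 6
+	// of 8 tables" below); the remaining entries are re-broadcast from the matrix
+	// slice (e.g. 48(CX), 56(CX)) inside the loop.
+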
+// func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x8(SB), $0-88
+ // Loading 6 of 8 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), BX
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, BX
+
+ // Add start offset to input
+ ADDQ R13, DX
+
+mulAvxGFNI_1x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y13, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y13, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y13, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y13, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y13, Y12
+ VBROADCASTSD 56(CX), Y14
+ VGF2P8AFFINEQB $0x00, Y14, Y13, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x8_end:
+ RET
+
+// func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x8_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, DX
+
+ // Add start offset to input
+ ADDQ R12, CX
+
+mulGFNI_1x8_64Xor_loop:
+ // Load 8 outputs
+ VMOVDQU64 (BX), Z8
+ VMOVDQU64 (SI), Z9
+ VMOVDQU64 (DI), Z10
+ VMOVDQU64 (R8), Z11
+ VMOVDQU64 (R9), Z12
+ VMOVDQU64 (R10), Z13
+ VMOVDQU64 (R11), Z14
+ VMOVDQU64 (DX), Z15
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (CX), Z16
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z16, Z17
+ VXORPD Z8, Z17, Z8
+ VGF2P8AFFINEQB $0x00, Z1, Z16, Z17
+ VXORPD Z9, Z17, Z9
+ VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
+ VXORPD Z10, Z17, Z10
+ VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
+ VXORPD Z11, Z17, Z11
+ VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
+ VXORPD Z12, Z17, Z12
+ VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
+ VXORPD Z13, Z17, Z13
+ VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Store 8 outputs
+ VMOVDQU64 Z8, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z9, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z10, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z11, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z12, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z13, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z14, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z15, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x8Xor(SB), $0-88
+ // Loading 6 of 8 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), BX
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, BX
+
+ // Add start offset to input
+ ADDQ R13, DX
+
+mulAvxGFNI_1x8Xor_loop:
+ // Load 8 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU (DI), Y7
+ VMOVDQU (R8), Y8
+ VMOVDQU (R9), Y9
+ VMOVDQU (R10), Y10
+ VMOVDQU (R11), Y11
+ VMOVDQU (R12), Y12
+ VMOVDQU (BX), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x8Xor_end:
+ RET
+
+// func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x9_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, DX
+
+ // Add start offset to input
+ ADDQ R13, CX
+
+mulGFNI_1x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (CX), Z17
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z17, Z9
+ VGF2P8AFFINEQB $0x00, Z1, Z17, Z10
+ VGF2P8AFFINEQB $0x00, Z2, Z17, Z11
+ VGF2P8AFFINEQB $0x00, Z3, Z17, Z12
+ VGF2P8AFFINEQB $0x00, Z4, Z17, Z13
+ VGF2P8AFFINEQB $0x00, Z5, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z6, Z17, Z15
+ VGF2P8AFFINEQB $0x00, Z7, Z17, Z16
+ VGF2P8AFFINEQB $0x00, Z8, Z17, Z17
+
+ // Store 9 outputs
+ VMOVDQU64 Z9, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z10, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z11, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z12, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z13, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z14, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z15, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z16, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z17, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x9_64_loop
+ VZEROUPPER
+
+mulGFNI_1x9_64_end:
+ RET
+
+// func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x9(SB), $0-88
+ // Loading 5 of 9 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), BX
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, BX
+
+ // Add start offset to input
+ ADDQ R14, DX
+
+mulAvxGFNI_1x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y13, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y13, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y13, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y13, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y13, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y13, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y13, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y13, Y12
+ VBROADCASTSD 64(CX), Y14
+ VGF2P8AFFINEQB $0x00, Y14, Y13, Y13
+
+ // Store 9 outputs
+ VMOVDQU Y5, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y7, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y8, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x9_end:
+ RET
+
+// func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x9_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, DX
+
+ // Add start offset to input
+ ADDQ R13, CX
+
+mulGFNI_1x9_64Xor_loop:
+ // Load 9 outputs
+ VMOVDQU64 (BX), Z9
+ VMOVDQU64 (SI), Z10
+ VMOVDQU64 (DI), Z11
+ VMOVDQU64 (R8), Z12
+ VMOVDQU64 (R9), Z13
+ VMOVDQU64 (R10), Z14
+ VMOVDQU64 (R11), Z15
+ VMOVDQU64 (R12), Z16
+ VMOVDQU64 (DX), Z17
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (CX), Z18
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z18, Z19
+ VXORPD Z9, Z19, Z9
+ VGF2P8AFFINEQB $0x00, Z1, Z18, Z19
+ VXORPD Z10, Z19, Z10
+ VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
+ VXORPD Z11, Z19, Z11
+ VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
+ VXORPD Z12, Z19, Z12
+ VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
+ VXORPD Z13, Z19, Z13
+ VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
+ VXORPD Z14, Z19, Z14
+ VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Store 9 outputs
+ VMOVDQU64 Z9, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z10, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z11, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z12, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z13, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z14, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z15, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z16, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z17, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x9Xor(SB), $0-88
+ // Loading 5 of 9 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), BX
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, BX
+
+ // Add start offset to input
+ ADDQ R14, DX
+
+mulAvxGFNI_1x9Xor_loop:
+ // Load 9 outputs
+ VMOVDQU (SI), Y5
+ VMOVDQU (DI), Y6
+ VMOVDQU (R8), Y7
+ VMOVDQU (R9), Y8
+ VMOVDQU (R10), Y9
+ VMOVDQU (R11), Y10
+ VMOVDQU (R12), Y11
+ VMOVDQU (R13), Y12
+ VMOVDQU (BX), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ VMOVDQU Y5, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y7, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y8, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x9Xor_end:
+ RET
+
+// func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x10_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, DX
+
+ // Add start offset to input
+ ADDQ R14, CX
+
+mulGFNI_1x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (CX), Z19
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z19, Z10
+ VGF2P8AFFINEQB $0x00, Z1, Z19, Z11
+ VGF2P8AFFINEQB $0x00, Z2, Z19, Z12
+ VGF2P8AFFINEQB $0x00, Z3, Z19, Z13
+ VGF2P8AFFINEQB $0x00, Z4, Z19, Z14
+ VGF2P8AFFINEQB $0x00, Z5, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z6, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z7, Z19, Z17
+ VGF2P8AFFINEQB $0x00, Z8, Z19, Z18
+ VGF2P8AFFINEQB $0x00, Z9, Z19, Z19
+
+ // Store 10 outputs
+ VMOVDQU64 Z10, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z11, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z12, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z13, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z14, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z15, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z16, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z17, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z18, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z19, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x10_64_loop
+ VZEROUPPER
+
+mulGFNI_1x10_64_end:
+ RET
+
+// func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x10(SB), $0-88
+ // Loading 4 of 10 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), R14
+ MOVQ 216(BX), BX
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, BX
+
+ // Add start offset to input
+ ADDQ R15, DX
+
+mulAvxGFNI_1x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (DX), Y13
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y13, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y13, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y13, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y13, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y13, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y13, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y13, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y13, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y13, Y12
+ VBROADCASTSD 72(CX), Y14
+ VGF2P8AFFINEQB $0x00, Y14, Y13, Y13
+
+ // Store 10 outputs
+ VMOVDQU Y4, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y5, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y6, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y7, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x10_end:
+ RET
+
+// func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_1x10_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_1x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), CX
+ MOVQ out_base+48(FP), DX
+ MOVQ out_base+48(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, DX
+
+ // Add start offset to input
+ ADDQ R14, CX
+
+mulGFNI_1x10_64Xor_loop:
+ // Load 10 outputs
+ VMOVDQU64 (BX), Z10
+ VMOVDQU64 (SI), Z11
+ VMOVDQU64 (DI), Z12
+ VMOVDQU64 (R8), Z13
+ VMOVDQU64 (R9), Z14
+ VMOVDQU64 (R10), Z15
+ VMOVDQU64 (R11), Z16
+ VMOVDQU64 (R12), Z17
+ VMOVDQU64 (R13), Z18
+ VMOVDQU64 (DX), Z19
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (CX), Z20
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z0, Z20, Z21
+ VXORPD Z10, Z21, Z10
+ VGF2P8AFFINEQB $0x00, Z1, Z20, Z21
+ VXORPD Z11, Z21, Z11
+ VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
+ VXORPD Z12, Z21, Z12
+ VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
+ VXORPD Z13, Z21, Z13
+ VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
+ VXORPD Z14, Z21, Z14
+ VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
+ VXORPD Z15, Z21, Z15
+ VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Store 10 outputs
+ VMOVDQU64 Z10, (BX)
+ ADDQ $0x40, BX
+ VMOVDQU64 Z11, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z12, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z13, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z14, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z15, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z16, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z17, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z18, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z19, (DX)
+ ADDQ $0x40, DX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_1x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_1x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_1x10Xor(SB), $0-88
+ // Loading 4 of 10 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_1x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), DX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), R14
+ MOVQ 216(BX), BX
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, BX
+
+ // Add start offset to input
+ ADDQ R15, DX
+
+mulAvxGFNI_1x10Xor_loop:
+ // Load 10 outputs
+ VMOVDQU (SI), Y4
+ VMOVDQU (DI), Y5
+ VMOVDQU (R8), Y6
+ VMOVDQU (R9), Y7
+ VMOVDQU (R10), Y8
+ VMOVDQU (R11), Y9
+ VMOVDQU (R12), Y10
+ VMOVDQU (R13), Y11
+ VMOVDQU (R14), Y12
+ VMOVDQU (BX), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ VMOVDQU Y4, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y5, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y6, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y7, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_1x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_1x10Xor_end:
+ RET
+
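+	// Note: the 2xN kernels below consume two input shards per iteration. The
+	// products of the first input initialize the outputs; the second input's
+	// products are XORed in. In the Xor variants the previously stored outputs are
+	// loaded first, so both inputs accumulate into them.
+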
+// func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 5 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), BX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+
+ // Add start offset to input
+ ADDQ SI, DX
+ ADDQ SI, CX
+
+mulGFNI_2x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z3
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z2
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (CX), Z3
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z1, Z3, Z3
+ VXORPD Z2, Z3, Z2
+
+ // Store 1 outputs
+ VMOVDQU64 Z2, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x1_64_loop
+ VZEROUPPER
+
+mulGFNI_2x1_64_end:
+ RET
+
+// func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 5 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), BX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+
+ // Add start offset to input
+ ADDQ SI, DX
+ ADDQ SI, CX
+
+mulAvxGFNI_2x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y3
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y3, Y2
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (CX), Y3
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y1, Y3, Y3
+ VXORPD Y2, Y3, Y2
+
+ // Store 1 outputs
+ VMOVDQU Y2, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x1_end:
+ RET
+
+// func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 5 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), BX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+
+ // Add start offset to input
+ ADDQ SI, DX
+ ADDQ SI, CX
+
+mulGFNI_2x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (BX), Z2
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z3
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z3
+ VXORPD Z2, Z3, Z2
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (CX), Z3
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z1, Z3, Z3
+ VXORPD Z2, Z3, Z2
+
+ // Store 1 outputs
+ VMOVDQU64 Z2, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 5 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), BX
+ MOVQ start+72(FP), SI
+
+ // Add start offset to output
+ ADDQ SI, BX
+
+ // Add start offset to input
+ ADDQ SI, DX
+ ADDQ SI, CX
+
+mulAvxGFNI_2x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (BX), Y2
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y3
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y3, Y3
+ VXORPD Y2, Y3, Y2
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (CX), Y3
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y1, Y3, Y3
+ VXORPD Y2, Y3, Y2
+
+ // Store 1 outputs
+ VMOVDQU Y2, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x1Xor_end:
+ RET
+
+// func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), BX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+ ADDQ DI, BX
+
+ // Add start offset to input
+ ADDQ DI, DX
+ ADDQ DI, CX
+
+mulGFNI_2x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z6
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z6, Z4
+ VGF2P8AFFINEQB $0x00, Z1, Z6, Z5
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (CX), Z6
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
+ VXORPD Z4, Z7, Z4
+ VGF2P8AFFINEQB $0x00, Z3, Z6, Z7
+ VXORPD Z5, Z7, Z5
+
+ // Store 2 outputs
+ VMOVDQU64 Z4, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z5, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x2_64_loop
+ VZEROUPPER
+
+mulGFNI_2x2_64_end:
+ RET
+
+// func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x2(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), BX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+ ADDQ DI, BX
+
+ // Add start offset to input
+ ADDQ DI, DX
+ ADDQ DI, CX
+
+mulAvxGFNI_2x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y6
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y6, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y6, Y5
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (CX), Y6
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y2, Y6, Y7
+ VXORPD Y4, Y7, Y4
+ VGF2P8AFFINEQB $0x00, Y3, Y6, Y7
+ VXORPD Y5, Y7, Y5
+
+ // Store 2 outputs
+ VMOVDQU Y4, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y5, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x2_end:
+ RET
+
+// func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), BX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+ ADDQ DI, BX
+
+ // Add start offset to input
+ ADDQ DI, DX
+ ADDQ DI, CX
+
+mulGFNI_2x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (SI), Z4
+ VMOVDQU64 (BX), Z5
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z6
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z6, Z7
+ VXORPD Z4, Z7, Z4
+ VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
+ VXORPD Z5, Z7, Z5
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (CX), Z6
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
+ VXORPD Z4, Z7, Z4
+ VGF2P8AFFINEQB $0x00, Z3, Z6, Z7
+ VXORPD Z5, Z7, Z5
+
+ // Store 2 outputs
+ VMOVDQU64 Z4, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z5, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x2Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), BX
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+ ADDQ DI, BX
+
+ // Add start offset to input
+ ADDQ DI, DX
+ ADDQ DI, CX
+
+mulAvxGFNI_2x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (SI), Y4
+ VMOVDQU (BX), Y5
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y6
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y6, Y7
+ VXORPD Y4, Y7, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y6, Y7
+ VXORPD Y5, Y7, Y5
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (CX), Y6
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y2, Y6, Y7
+ VXORPD Y4, Y7, Y4
+ VGF2P8AFFINEQB $0x00, Y3, Y6, Y7
+ VXORPD Y5, Y7, Y5
+
+ // Store 2 outputs
+ VMOVDQU Y4, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y5, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x2Xor_end:
+ RET
+
+// func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 11 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), BX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, BX
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, CX
+
+mulGFNI_2x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z9
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z9, Z6
+ VGF2P8AFFINEQB $0x00, Z1, Z9, Z7
+ VGF2P8AFFINEQB $0x00, Z2, Z9, Z8
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (CX), Z9
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z3, Z9, Z10
+ VXORPD Z6, Z10, Z6
+ VGF2P8AFFINEQB $0x00, Z4, Z9, Z10
+ VXORPD Z7, Z10, Z7
+ VGF2P8AFFINEQB $0x00, Z5, Z9, Z10
+ VXORPD Z8, Z10, Z8
+
+ // Store 3 outputs
+ VMOVDQU64 Z6, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z7, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z8, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x3_64_loop
+ VZEROUPPER
+
+mulGFNI_2x3_64_end:
+ RET
+
+// func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x3(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 11 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), BX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, BX
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, CX
+
+mulAvxGFNI_2x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y9, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y9, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y9, Y8
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (CX), Y9
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y3, Y9, Y10
+ VXORPD Y6, Y10, Y6
+ VGF2P8AFFINEQB $0x00, Y4, Y9, Y10
+ VXORPD Y7, Y10, Y7
+ VGF2P8AFFINEQB $0x00, Y5, Y9, Y10
+ VXORPD Y8, Y10, Y8
+
+ // Store 3 outputs
+ VMOVDQU Y6, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y8, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x3_end:
+ RET
+
+// func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x3_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 11 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), BX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, BX
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, CX
+
+mulGFNI_2x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (SI), Z6
+ VMOVDQU64 (DI), Z7
+ VMOVDQU64 (BX), Z8
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z9
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z9, Z10
+ VXORPD Z6, Z10, Z6
+ VGF2P8AFFINEQB $0x00, Z1, Z9, Z10
+ VXORPD Z7, Z10, Z7
+ VGF2P8AFFINEQB $0x00, Z2, Z9, Z10
+ VXORPD Z8, Z10, Z8
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (CX), Z9
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z3, Z9, Z10
+ VXORPD Z6, Z10, Z6
+ VGF2P8AFFINEQB $0x00, Z4, Z9, Z10
+ VXORPD Z7, Z10, Z7
+ VGF2P8AFFINEQB $0x00, Z5, Z9, Z10
+ VXORPD Z8, Z10, Z8
+
+ // Store 3 outputs
+ VMOVDQU64 Z6, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z7, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z8, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x3Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 11 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), BX
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, SI
+ ADDQ R8, DI
+ ADDQ R8, BX
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, CX
+
+mulAvxGFNI_2x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (SI), Y6
+ VMOVDQU (DI), Y7
+ VMOVDQU (BX), Y8
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y9, Y10
+ VXORPD Y6, Y10, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y9, Y10
+ VXORPD Y7, Y10, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y9, Y10
+ VXORPD Y8, Y10, Y8
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (CX), Y9
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y3, Y9, Y10
+ VXORPD Y6, Y10, Y6
+ VGF2P8AFFINEQB $0x00, Y4, Y9, Y10
+ VXORPD Y7, Y10, Y7
+ VGF2P8AFFINEQB $0x00, Y5, Y9, Y10
+ VXORPD Y8, Y10, Y8
+
+ // Store 3 outputs
+ VMOVDQU Y6, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y8, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x3Xor_end:
+ RET
+
+// func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x4_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), BX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, BX
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, CX
+
+mulGFNI_2x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z12
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z12, Z8
+ VGF2P8AFFINEQB $0x00, Z1, Z12, Z9
+ VGF2P8AFFINEQB $0x00, Z2, Z12, Z10
+ VGF2P8AFFINEQB $0x00, Z3, Z12, Z11
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (CX), Z12
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
+ VXORPD Z8, Z13, Z8
+ VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Store 4 outputs
+ VMOVDQU64 Z8, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z9, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z10, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z11, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x4_64_loop
+ VZEROUPPER
+
+mulGFNI_2x4_64_end:
+ RET
+
+// func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x4(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), BX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, BX
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, CX
+
+mulAvxGFNI_2x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y12, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y12, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y12, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y12, Y11
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (CX), Y12
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
+ VXORPD Y8, Y13, Y8
+ VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Store 4 outputs
+ VMOVDQU Y8, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x4_end:
+ RET
+
+// func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x4_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), BX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, BX
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, CX
+
+mulGFNI_2x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (SI), Z8
+ VMOVDQU64 (DI), Z9
+ VMOVDQU64 (R8), Z10
+ VMOVDQU64 (BX), Z11
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z12
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z12, Z13
+ VXORPD Z8, Z13, Z8
+ VGF2P8AFFINEQB $0x00, Z1, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (CX), Z12
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
+ VXORPD Z8, Z13, Z8
+ VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Store 4 outputs
+ VMOVDQU64 Z8, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z9, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z10, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z11, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x4Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), BX
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, BX
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, CX
+
+mulAvxGFNI_2x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (SI), Y8
+ VMOVDQU (DI), Y9
+ VMOVDQU (R8), Y10
+ VMOVDQU (BX), Y11
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y12, Y13
+ VXORPD Y8, Y13, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (CX), Y12
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
+ VXORPD Y8, Y13, Y8
+ VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Store 4 outputs
+ VMOVDQU Y8, (SI)
+ ADDQ $0x20, SI
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (BX)
+ ADDQ $0x20, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x4Xor_end:
+ RET
+
+// func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x5_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), BX
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, BX
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, CX
+
+mulGFNI_2x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z15
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z15, Z10
+ VGF2P8AFFINEQB $0x00, Z1, Z15, Z11
+ VGF2P8AFFINEQB $0x00, Z2, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z3, Z15, Z13
+ VGF2P8AFFINEQB $0x00, Z4, Z15, Z14
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (CX), Z15
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z5, Z15, Z16
+ VXORPD Z10, Z16, Z10
+ VGF2P8AFFINEQB $0x00, Z6, Z15, Z16
+ VXORPD Z11, Z16, Z11
+ VGF2P8AFFINEQB $0x00, Z7, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z8, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z9, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Store 5 outputs
+ VMOVDQU64 Z10, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z11, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z12, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z13, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z14, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x5_64_loop
+ VZEROUPPER
+
+mulGFNI_2x5_64_end:
+ RET
+
+// func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x5(SB), $0-88
+ // Loading 9 of 10 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), SI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, SI
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, DX
+
+mulAvxGFNI_2x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x5_end:
+ RET
+
+// func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x5_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), BX
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, BX
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, CX
+
+mulGFNI_2x5_64Xor_loop:
+ // Load 5 outputs
+ VMOVDQU64 (SI), Z10
+ VMOVDQU64 (DI), Z11
+ VMOVDQU64 (R8), Z12
+ VMOVDQU64 (R9), Z13
+ VMOVDQU64 (BX), Z14
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z15
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z15, Z16
+ VXORPD Z10, Z16, Z10
+ VGF2P8AFFINEQB $0x00, Z1, Z15, Z16
+ VXORPD Z11, Z16, Z11
+ VGF2P8AFFINEQB $0x00, Z2, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z3, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z4, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (CX), Z15
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z5, Z15, Z16
+ VXORPD Z10, Z16, Z10
+ VGF2P8AFFINEQB $0x00, Z6, Z15, Z16
+ VXORPD Z11, Z16, Z11
+ VGF2P8AFFINEQB $0x00, Z7, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z8, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z9, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Store 5 outputs
+ VMOVDQU64 Z10, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z11, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z12, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z13, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z14, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x5Xor(SB), $0-88
+ // Loading 9 of 10 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), SI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, SI
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, DX
+
+mulAvxGFNI_2x5Xor_loop:
+ // Load 5 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU (R8), Y10
+ VMOVDQU (R9), Y11
+ VMOVDQU (R10), Y12
+ VMOVDQU (SI), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x5Xor_end:
+ RET
+
+// func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x6_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), BX
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, BX
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, CX
+
+mulGFNI_2x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (DX), Z18
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z18, Z12
+ VGF2P8AFFINEQB $0x00, Z1, Z18, Z13
+ VGF2P8AFFINEQB $0x00, Z2, Z18, Z14
+ VGF2P8AFFINEQB $0x00, Z3, Z18, Z15
+ VGF2P8AFFINEQB $0x00, Z4, Z18, Z16
+ VGF2P8AFFINEQB $0x00, Z5, Z18, Z17
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (CX), Z18
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
+ VXORPD Z12, Z19, Z12
+ VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
+ VXORPD Z13, Z19, Z13
+ VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
+ VXORPD Z14, Z19, Z14
+ VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Store 6 outputs
+ VMOVDQU64 Z12, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z13, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z14, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z15, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z16, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z17, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x6_64_loop
+ VZEROUPPER
+
+mulGFNI_2x6_64_end:
+ RET
+
+// func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x6(SB), $0-88
+ // Loading 8 of 12 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), SI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, SI
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, DX
+
+mulAvxGFNI_2x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y9, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y12, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x6_end:
+ RET
+
+// func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x6_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), BX
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, BX
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, CX
+
+mulGFNI_2x6_64Xor_loop:
+ // Load 6 outputs
+ VMOVDQU64 (SI), Z12
+ VMOVDQU64 (DI), Z13
+ VMOVDQU64 (R8), Z14
+ VMOVDQU64 (R9), Z15
+ VMOVDQU64 (R10), Z16
+ VMOVDQU64 (BX), Z17
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (DX), Z18
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z18, Z19
+ VXORPD Z12, Z19, Z12
+ VGF2P8AFFINEQB $0x00, Z1, Z18, Z19
+ VXORPD Z13, Z19, Z13
+ VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
+ VXORPD Z14, Z19, Z14
+ VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (CX), Z18
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
+ VXORPD Z12, Z19, Z12
+ VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
+ VXORPD Z13, Z19, Z13
+ VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
+ VXORPD Z14, Z19, Z14
+ VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Store 6 outputs
+ VMOVDQU64 Z12, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z13, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z14, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z15, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z16, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z17, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x6Xor(SB), $0-88
+ // Loading 8 of 12 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), SI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, SI
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, DX
+
+mulAvxGFNI_2x6Xor_loop:
+ // Load 6 outputs
+ VMOVDQU (DI), Y8
+ VMOVDQU (R8), Y9
+ VMOVDQU (R9), Y10
+ VMOVDQU (R10), Y11
+ VMOVDQU (R11), Y12
+ VMOVDQU (SI), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y9, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y12, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x6Xor_end:
+ RET
+
+// func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x7_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), BX
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, BX
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, CX
+
+mulGFNI_2x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (DX), Z21
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z21, Z14
+ VGF2P8AFFINEQB $0x00, Z1, Z21, Z15
+ VGF2P8AFFINEQB $0x00, Z2, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z3, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z4, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z5, Z21, Z19
+ VGF2P8AFFINEQB $0x00, Z6, Z21, Z20
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (CX), Z21
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z7, Z21, Z22
+ VXORPD Z14, Z22, Z14
+ VGF2P8AFFINEQB $0x00, Z8, Z21, Z22
+ VXORPD Z15, Z22, Z15
+ VGF2P8AFFINEQB $0x00, Z9, Z21, Z22
+ VXORPD Z16, Z22, Z16
+ VGF2P8AFFINEQB $0x00, Z10, Z21, Z22
+ VXORPD Z17, Z22, Z17
+ VGF2P8AFFINEQB $0x00, Z11, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z12, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z13, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Store 7 outputs
+ VMOVDQU64 Z14, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z15, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z16, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z17, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z18, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z19, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z20, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x7_64_loop
+ VZEROUPPER
+
+mulGFNI_2x7_64_end:
+ RET
+
+// func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x7(SB), $0-88
+ // Loading 7 of 14 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), SI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, SI
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, DX
+
+mulAvxGFNI_2x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x7_end:
+ RET
+
+// func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x7_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), BX
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, BX
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, CX
+
+mulGFNI_2x7_64Xor_loop:
+ // Load 7 outputs
+ VMOVDQU64 (SI), Z14
+ VMOVDQU64 (DI), Z15
+ VMOVDQU64 (R8), Z16
+ VMOVDQU64 (R9), Z17
+ VMOVDQU64 (R10), Z18
+ VMOVDQU64 (R11), Z19
+ VMOVDQU64 (BX), Z20
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (DX), Z21
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z21, Z22
+ VXORPD Z14, Z22, Z14
+ VGF2P8AFFINEQB $0x00, Z1, Z21, Z22
+ VXORPD Z15, Z22, Z15
+ VGF2P8AFFINEQB $0x00, Z2, Z21, Z22
+ VXORPD Z16, Z22, Z16
+ VGF2P8AFFINEQB $0x00, Z3, Z21, Z22
+ VXORPD Z17, Z22, Z17
+ VGF2P8AFFINEQB $0x00, Z4, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z5, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z6, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (CX), Z21
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z7, Z21, Z22
+ VXORPD Z14, Z22, Z14
+ VGF2P8AFFINEQB $0x00, Z8, Z21, Z22
+ VXORPD Z15, Z22, Z15
+ VGF2P8AFFINEQB $0x00, Z9, Z21, Z22
+ VXORPD Z16, Z22, Z16
+ VGF2P8AFFINEQB $0x00, Z10, Z21, Z22
+ VXORPD Z17, Z22, Z17
+ VGF2P8AFFINEQB $0x00, Z11, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z12, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z13, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Store 7 outputs
+ VMOVDQU64 Z14, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z15, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z16, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z17, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z18, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z19, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z20, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x7Xor(SB), $0-88
+ // Loading 7 of 14 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), SI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, SI
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, DX
+
+mulAvxGFNI_2x7Xor_loop:
+ // Load 7 outputs
+ VMOVDQU (DI), Y7
+ VMOVDQU (R8), Y8
+ VMOVDQU (R9), Y9
+ VMOVDQU (R10), Y10
+ VMOVDQU (R11), Y11
+ VMOVDQU (R12), Y12
+ VMOVDQU (SI), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x7Xor_end:
+ RET
+
+// func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x8_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), BX
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, BX
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, CX
+
+mulGFNI_2x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (DX), Z24
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z24, Z16
+ VGF2P8AFFINEQB $0x00, Z1, Z24, Z17
+ VGF2P8AFFINEQB $0x00, Z2, Z24, Z18
+ VGF2P8AFFINEQB $0x00, Z3, Z24, Z19
+ VGF2P8AFFINEQB $0x00, Z4, Z24, Z20
+ VGF2P8AFFINEQB $0x00, Z5, Z24, Z21
+ VGF2P8AFFINEQB $0x00, Z6, Z24, Z22
+ VGF2P8AFFINEQB $0x00, Z7, Z24, Z23
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (CX), Z24
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
+ VXORPD Z16, Z25, Z16
+ VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
+ VXORPD Z17, Z25, Z17
+ VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
+ VXORPD Z18, Z25, Z18
+ VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
+ VXORPD Z19, Z25, Z19
+ VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Store 8 outputs
+ VMOVDQU64 Z16, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z17, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z18, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z19, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z20, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z21, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z22, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z23, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x8_64_loop
+ VZEROUPPER
+
+mulGFNI_2x8_64_end:
+ RET
+
+// func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x8(SB), $0-88
+ // Loading 6 of 16 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), SI
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, SI
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, DX
+
+mulAvxGFNI_2x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y7, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y8, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x8_end:
+ RET
+
+// func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x8_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), BX
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, BX
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, CX
+
+mulGFNI_2x8_64Xor_loop:
+ // Load 8 outputs
+ VMOVDQU64 (SI), Z16
+ VMOVDQU64 (DI), Z17
+ VMOVDQU64 (R8), Z18
+ VMOVDQU64 (R9), Z19
+ VMOVDQU64 (R10), Z20
+ VMOVDQU64 (R11), Z21
+ VMOVDQU64 (R12), Z22
+ VMOVDQU64 (BX), Z23
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (DX), Z24
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
+ VXORPD Z16, Z25, Z16
+ VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
+ VXORPD Z17, Z25, Z17
+ VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
+ VXORPD Z18, Z25, Z18
+ VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
+ VXORPD Z19, Z25, Z19
+ VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (CX), Z24
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
+ VXORPD Z16, Z25, Z16
+ VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
+ VXORPD Z17, Z25, Z17
+ VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
+ VXORPD Z18, Z25, Z18
+ VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
+ VXORPD Z19, Z25, Z19
+ VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Store 8 outputs
+ VMOVDQU64 Z16, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z17, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z18, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z19, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z20, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z21, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z22, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z23, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x8Xor(SB), $0-88
+ // Loading 6 of 16 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), SI
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, SI
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, DX
+
+mulAvxGFNI_2x8Xor_loop:
+ // Load 8 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU (R8), Y7
+ VMOVDQU (R9), Y8
+ VMOVDQU (R10), Y9
+ VMOVDQU (R11), Y10
+ VMOVDQU (R12), Y11
+ VMOVDQU (R13), Y12
+ VMOVDQU (SI), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y7, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y8, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x8Xor_end:
+ RET
+
+// func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x9_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), BX
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, BX
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, CX
+
+mulGFNI_2x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (DX), Z27
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z27, Z18
+ VGF2P8AFFINEQB $0x00, Z1, Z27, Z19
+ VGF2P8AFFINEQB $0x00, Z2, Z27, Z20
+ VGF2P8AFFINEQB $0x00, Z3, Z27, Z21
+ VGF2P8AFFINEQB $0x00, Z4, Z27, Z22
+ VGF2P8AFFINEQB $0x00, Z5, Z27, Z23
+ VGF2P8AFFINEQB $0x00, Z6, Z27, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z27, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z27, Z26
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (CX), Z27
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z9, Z27, Z28
+ VXORPD Z18, Z28, Z18
+ VGF2P8AFFINEQB $0x00, Z10, Z27, Z28
+ VXORPD Z19, Z28, Z19
+ VGF2P8AFFINEQB $0x00, Z11, Z27, Z28
+ VXORPD Z20, Z28, Z20
+ VGF2P8AFFINEQB $0x00, Z12, Z27, Z28
+ VXORPD Z21, Z28, Z21
+ VGF2P8AFFINEQB $0x00, Z13, Z27, Z28
+ VXORPD Z22, Z28, Z22
+ VGF2P8AFFINEQB $0x00, Z14, Z27, Z28
+ VXORPD Z23, Z28, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Store 9 outputs
+ VMOVDQU64 Z18, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z19, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z20, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z21, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z22, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z23, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z24, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z25, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z26, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x9_64_loop
+ VZEROUPPER
+
+mulGFNI_2x9_64_end:
+ RET
+
+// func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x9(SB), $0-88
+ // Loading 5 of 18 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), SI
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, SI
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, DX
+
+mulAvxGFNI_2x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ VMOVDQU Y5, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y6, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y7, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x9_end:
+ RET
+
+// func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x9_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), BX
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, BX
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, CX
+
+mulGFNI_2x9_64Xor_loop:
+ // Load 9 outputs
+ VMOVDQU64 (SI), Z18
+ VMOVDQU64 (DI), Z19
+ VMOVDQU64 (R8), Z20
+ VMOVDQU64 (R9), Z21
+ VMOVDQU64 (R10), Z22
+ VMOVDQU64 (R11), Z23
+ VMOVDQU64 (R12), Z24
+ VMOVDQU64 (R13), Z25
+ VMOVDQU64 (BX), Z26
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (DX), Z27
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z27, Z28
+ VXORPD Z18, Z28, Z18
+ VGF2P8AFFINEQB $0x00, Z1, Z27, Z28
+ VXORPD Z19, Z28, Z19
+ VGF2P8AFFINEQB $0x00, Z2, Z27, Z28
+ VXORPD Z20, Z28, Z20
+ VGF2P8AFFINEQB $0x00, Z3, Z27, Z28
+ VXORPD Z21, Z28, Z21
+ VGF2P8AFFINEQB $0x00, Z4, Z27, Z28
+ VXORPD Z22, Z28, Z22
+ VGF2P8AFFINEQB $0x00, Z5, Z27, Z28
+ VXORPD Z23, Z28, Z23
+ VGF2P8AFFINEQB $0x00, Z6, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (CX), Z27
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z9, Z27, Z28
+ VXORPD Z18, Z28, Z18
+ VGF2P8AFFINEQB $0x00, Z10, Z27, Z28
+ VXORPD Z19, Z28, Z19
+ VGF2P8AFFINEQB $0x00, Z11, Z27, Z28
+ VXORPD Z20, Z28, Z20
+ VGF2P8AFFINEQB $0x00, Z12, Z27, Z28
+ VXORPD Z21, Z28, Z21
+ VGF2P8AFFINEQB $0x00, Z13, Z27, Z28
+ VXORPD Z22, Z28, Z22
+ VGF2P8AFFINEQB $0x00, Z14, Z27, Z28
+ VXORPD Z23, Z28, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Store 9 outputs
+ VMOVDQU64 Z18, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z19, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z20, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z21, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z22, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z23, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z24, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z25, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z26, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x9Xor(SB), $0-88
+ // Loading 5 of 18 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), SI
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, SI
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, DX
+
+mulAvxGFNI_2x9Xor_loop:
+ // Load 9 outputs
+ VMOVDQU (DI), Y5
+ VMOVDQU (R8), Y6
+ VMOVDQU (R9), Y7
+ VMOVDQU (R10), Y8
+ VMOVDQU (R11), Y9
+ VMOVDQU (R12), Y10
+ VMOVDQU (R13), Y11
+ VMOVDQU (R14), Y12
+ VMOVDQU (SI), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ VMOVDQU Y5, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y6, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y7, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x9Xor_end:
+ RET
+
+// func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x10_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), R14
+ MOVQ 216(BX), BX
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, BX
+
+ // Add start offset to input
+ ADDQ R15, DX
+ ADDQ R15, CX
+
+mulGFNI_2x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (CX), Z30
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ VMOVDQU64 Z20, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z21, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z22, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z23, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x10_64_loop
+ VZEROUPPER
+
+mulGFNI_2x10_64_end:
+ RET
+
+// func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x10(SB), $8-88
+ // Loading 4 of 20 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), R15
+ MOVQ 216(SI), SI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, SI
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, DX
+
+mulAvxGFNI_2x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ VMOVDQU Y4, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x10_end:
+ RET
+
+// func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_2x10_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_2x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), CX
+ MOVQ out_base+48(FP), BX
+ MOVQ out_base+48(FP), BX
+ MOVQ (BX), SI
+ MOVQ 24(BX), DI
+ MOVQ 48(BX), R8
+ MOVQ 72(BX), R9
+ MOVQ 96(BX), R10
+ MOVQ 120(BX), R11
+ MOVQ 144(BX), R12
+ MOVQ 168(BX), R13
+ MOVQ 192(BX), R14
+ MOVQ 216(BX), BX
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, BX
+
+ // Add start offset to input
+ ADDQ R15, DX
+ ADDQ R15, CX
+
+mulGFNI_2x10_64Xor_loop:
+ // Load 10 outputs
+ VMOVDQU64 (SI), Z20
+ VMOVDQU64 (DI), Z21
+ VMOVDQU64 (R8), Z22
+ VMOVDQU64 (R9), Z23
+ VMOVDQU64 (R10), Z24
+ VMOVDQU64 (R11), Z25
+ VMOVDQU64 (R12), Z26
+ VMOVDQU64 (R13), Z27
+ VMOVDQU64 (R14), Z28
+ VMOVDQU64 (BX), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (CX), Z30
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ VMOVDQU64 Z20, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z21, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z22, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z23, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (BX)
+ ADDQ $0x40, BX
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_2x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_2x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_2x10Xor(SB), $8-88
+ // Loading 4 of 20 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_2x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), DX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), R15
+ MOVQ 216(SI), SI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, SI
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, DX
+
+mulAvxGFNI_2x10Xor_loop:
+ // Load 10 outputs
+ VMOVDQU (DI), Y4
+ VMOVDQU (R8), Y5
+ VMOVDQU (R9), Y6
+ VMOVDQU (R10), Y7
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (SI), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ VMOVDQU Y4, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_2x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_2x10Xor_end:
+ RET
+
+// func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 6 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), SI
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+
+ // Add start offset to input
+ ADDQ DI, DX
+ ADDQ DI, BX
+ ADDQ DI, CX
+
+mulGFNI_3x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z4
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z3
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z4
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z4, Z4
+ VXORPD Z3, Z4, Z3
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (CX), Z4
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z2, Z4, Z4
+ VXORPD Z3, Z4, Z3
+
+ // Store 1 outputs
+ VMOVDQU64 Z3, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x1_64_loop
+ VZEROUPPER
+
+mulGFNI_3x1_64_end:
+ RET
+
+// func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 6 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), SI
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+
+ // Add start offset to input
+ ADDQ DI, DX
+ ADDQ DI, BX
+ ADDQ DI, CX
+
+mulAvxGFNI_3x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y4
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y4, Y3
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y4
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y4, Y4
+ VXORPD Y3, Y4, Y3
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (CX), Y4
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y2, Y4, Y4
+ VXORPD Y3, Y4, Y3
+
+ // Store 1 outputs
+ VMOVDQU Y3, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x1_end:
+ RET
+
+// func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 6 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), SI
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+
+ // Add start offset to input
+ ADDQ DI, DX
+ ADDQ DI, BX
+ ADDQ DI, CX
+
+mulGFNI_3x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (SI), Z3
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z4
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z4
+ VXORPD Z3, Z4, Z3
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z4
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z4, Z4
+ VXORPD Z3, Z4, Z3
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (CX), Z4
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z2, Z4, Z4
+ VXORPD Z3, Z4, Z3
+
+ // Store 1 outputs
+ VMOVDQU64 Z3, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 6 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), SI
+ MOVQ start+72(FP), DI
+
+ // Add start offset to output
+ ADDQ DI, SI
+
+ // Add start offset to input
+ ADDQ DI, DX
+ ADDQ DI, BX
+ ADDQ DI, CX
+
+mulAvxGFNI_3x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (SI), Y3
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y4
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y4, Y4
+ VXORPD Y3, Y4, Y3
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y4
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y4, Y4
+ VXORPD Y3, Y4, Y3
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (CX), Y4
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y2, Y4, Y4
+ VXORPD Y3, Y4, Y3
+
+ // Store 1 outputs
+ VMOVDQU Y3, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x1Xor_end:
+ RET
+
+// func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), SI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+ ADDQ R8, SI
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, BX
+ ADDQ R8, CX
+
+mulGFNI_3x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z8
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z8, Z6
+ VGF2P8AFFINEQB $0x00, Z1, Z8, Z7
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z8
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z8, Z9
+ VXORPD Z6, Z9, Z6
+ VGF2P8AFFINEQB $0x00, Z3, Z8, Z9
+ VXORPD Z7, Z9, Z7
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (CX), Z8
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z4, Z8, Z9
+ VXORPD Z6, Z9, Z6
+ VGF2P8AFFINEQB $0x00, Z5, Z8, Z9
+ VXORPD Z7, Z9, Z7
+
+ // Store 2 outputs
+ VMOVDQU64 Z6, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z7, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x2_64_loop
+ VZEROUPPER
+
+mulGFNI_3x2_64_end:
+ RET
+
+// func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x2(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), SI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+ ADDQ R8, SI
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, BX
+ ADDQ R8, CX
+
+mulAvxGFNI_3x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y8, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y8, Y7
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y2, Y8, Y9
+ VXORPD Y6, Y9, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y8, Y9
+ VXORPD Y7, Y9, Y7
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (CX), Y8
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y4, Y8, Y9
+ VXORPD Y6, Y9, Y6
+ VGF2P8AFFINEQB $0x00, Y5, Y8, Y9
+ VXORPD Y7, Y9, Y7
+
+ // Store 2 outputs
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y7, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x2_end:
+ RET
+
+// func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), SI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+ ADDQ R8, SI
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, BX
+ ADDQ R8, CX
+
+mulGFNI_3x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (DI), Z6
+ VMOVDQU64 (SI), Z7
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z8
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z8, Z9
+ VXORPD Z6, Z9, Z6
+ VGF2P8AFFINEQB $0x00, Z1, Z8, Z9
+ VXORPD Z7, Z9, Z7
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z8
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z8, Z9
+ VXORPD Z6, Z9, Z6
+ VGF2P8AFFINEQB $0x00, Z3, Z8, Z9
+ VXORPD Z7, Z9, Z7
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (CX), Z8
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z4, Z8, Z9
+ VXORPD Z6, Z9, Z6
+ VGF2P8AFFINEQB $0x00, Z5, Z8, Z9
+ VXORPD Z7, Z9, Z7
+
+ // Store 2 outputs
+ VMOVDQU64 Z6, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z7, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x2Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), SI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+ ADDQ R8, SI
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, BX
+ ADDQ R8, CX
+
+mulAvxGFNI_3x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (DI), Y6
+ VMOVDQU (SI), Y7
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y8, Y9
+ VXORPD Y6, Y9, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y8, Y9
+ VXORPD Y7, Y9, Y7
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y2, Y8, Y9
+ VXORPD Y6, Y9, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y8, Y9
+ VXORPD Y7, Y9, Y7
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (CX), Y8
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y4, Y8, Y9
+ VXORPD Y6, Y9, Y6
+ VGF2P8AFFINEQB $0x00, Y5, Y8, Y9
+ VXORPD Y7, Y9, Y7
+
+ // Store 2 outputs
+ VMOVDQU Y6, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y7, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x2Xor_end:
+ RET
+
+// func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), SI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, SI
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, CX
+
+mulGFNI_3x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z12
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z12, Z9
+ VGF2P8AFFINEQB $0x00, Z1, Z12, Z10
+ VGF2P8AFFINEQB $0x00, Z2, Z12, Z11
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z12
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (CX), Z12
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z8, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Store 3 outputs
+ VMOVDQU64 Z9, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z10, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z11, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x3_64_loop
+ VZEROUPPER
+
+mulGFNI_3x3_64_end:
+ RET
+
+// func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x3(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), SI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, SI
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, CX
+
+mulAvxGFNI_3x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y12, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y12, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y12, Y11
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (CX), Y12
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y8, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Store 3 outputs
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x3_end:
+ RET
+
+// func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x3_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), SI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, SI
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, CX
+
+mulGFNI_3x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (DI), Z9
+ VMOVDQU64 (R8), Z10
+ VMOVDQU64 (SI), Z11
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z12
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z1, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z12
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (CX), Z12
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
+ VXORPD Z9, Z13, Z9
+ VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z8, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Store 3 outputs
+ VMOVDQU64 Z9, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z10, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z11, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x3Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), SI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, DI
+ ADDQ R9, R8
+ ADDQ R9, SI
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, CX
+
+mulAvxGFNI_3x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (DI), Y9
+ VMOVDQU (R8), Y10
+ VMOVDQU (SI), Y11
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (CX), Y12
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
+ VXORPD Y9, Y13, Y9
+ VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y8, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Store 3 outputs
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x3Xor_end:
+ RET
+
+// func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x4_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), SI
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, SI
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, CX
+
+mulGFNI_3x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z16
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z1, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z2, Z16, Z14
+ VGF2P8AFFINEQB $0x00, Z3, Z16, Z15
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z16
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
+ VXORPD Z12, Z17, Z12
+ VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
+ VXORPD Z13, Z17, Z13
+ VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (CX), Z16
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
+ VXORPD Z12, Z17, Z12
+ VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
+ VXORPD Z13, Z17, Z13
+ VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Store 4 outputs
+ VMOVDQU64 Z12, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z13, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z14, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z15, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x4_64_loop
+ VZEROUPPER
+
+mulGFNI_3x4_64_end:
+ RET
+
+// func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x4(SB), $0-88
+ // Loading 10 of 12 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), DI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DI
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DX
+
+mulAvxGFNI_3x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x4_end:
+ RET
+
+// func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x4_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), SI
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, SI
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, CX
+
+mulGFNI_3x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (DI), Z12
+ VMOVDQU64 (R8), Z13
+ VMOVDQU64 (R9), Z14
+ VMOVDQU64 (SI), Z15
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z16
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z16, Z17
+ VXORPD Z12, Z17, Z12
+ VGF2P8AFFINEQB $0x00, Z1, Z16, Z17
+ VXORPD Z13, Z17, Z13
+ VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z16
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
+ VXORPD Z12, Z17, Z12
+ VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
+ VXORPD Z13, Z17, Z13
+ VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (CX), Z16
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
+ VXORPD Z12, Z17, Z12
+ VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
+ VXORPD Z13, Z17, Z13
+ VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Store 4 outputs
+ VMOVDQU64 Z12, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z13, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z14, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z15, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x4Xor(SB), $0-88
+ // Loading 10 of 12 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), DI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DI
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DX
+
+mulAvxGFNI_3x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (R8), Y10
+ VMOVDQU (R9), Y11
+ VMOVDQU (R10), Y12
+ VMOVDQU (DI), Y13
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y11, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x4Xor_end:
+ RET
+
+// func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x5_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), SI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, SI
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, CX
+
+mulGFNI_3x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z20
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z20, Z15
+ VGF2P8AFFINEQB $0x00, Z1, Z20, Z16
+ VGF2P8AFFINEQB $0x00, Z2, Z20, Z17
+ VGF2P8AFFINEQB $0x00, Z3, Z20, Z18
+ VGF2P8AFFINEQB $0x00, Z4, Z20, Z19
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (BX), Z20
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
+ VXORPD Z15, Z21, Z15
+ VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (CX), Z20
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
+ VXORPD Z15, Z21, Z15
+ VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Store 5 outputs
+ VMOVDQU64 Z15, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z16, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z17, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z18, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z19, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x5_64_loop
+ VZEROUPPER
+
+mulGFNI_3x5_64_end:
+ RET
+
+// func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x5(SB), $0-88
+ // Loading 9 of 15 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), DI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, DI
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DX
+
+mulAvxGFNI_3x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y12, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x5_end:
+ RET
+
+// func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x5_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), SI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, SI
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, CX
+
+mulGFNI_3x5_64Xor_loop:
+ // Load 5 outputs
+ VMOVDQU64 (DI), Z15
+ VMOVDQU64 (R8), Z16
+ VMOVDQU64 (R9), Z17
+ VMOVDQU64 (R10), Z18
+ VMOVDQU64 (SI), Z19
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z20
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z20, Z21
+ VXORPD Z15, Z21, Z15
+ VGF2P8AFFINEQB $0x00, Z1, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (BX), Z20
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
+ VXORPD Z15, Z21, Z15
+ VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (CX), Z20
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
+ VXORPD Z15, Z21, Z15
+ VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Store 5 outputs
+ VMOVDQU64 Z15, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z16, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z17, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z18, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z19, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x5Xor(SB), $0-88
+ // Loading 9 of 15 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), DI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, DI
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DX
+
+mulAvxGFNI_3x5Xor_loop:
+ // Load 5 outputs
+ VMOVDQU (R8), Y9
+ VMOVDQU (R9), Y10
+ VMOVDQU (R10), Y11
+ VMOVDQU (R11), Y12
+ VMOVDQU (DI), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y12, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x5Xor_end:
+ RET
+
+// func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x6_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), SI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, SI
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, CX
+
+mulGFNI_3x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (DX), Z24
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z24, Z18
+ VGF2P8AFFINEQB $0x00, Z1, Z24, Z19
+ VGF2P8AFFINEQB $0x00, Z2, Z24, Z20
+ VGF2P8AFFINEQB $0x00, Z3, Z24, Z21
+ VGF2P8AFFINEQB $0x00, Z4, Z24, Z22
+ VGF2P8AFFINEQB $0x00, Z5, Z24, Z23
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (BX), Z24
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
+ VXORPD Z18, Z25, Z18
+ VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
+ VXORPD Z19, Z25, Z19
+ VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (CX), Z24
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
+ VXORPD Z18, Z25, Z18
+ VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
+ VXORPD Z19, Z25, Z19
+ VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Store 6 outputs
+ VMOVDQU64 Z18, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z19, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z20, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z21, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z22, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z23, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x6_64_loop
+ VZEROUPPER
+
+mulGFNI_3x6_64_end:
+ RET
+
+// func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x6(SB), $0-88
+ // Loading 8 of 18 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), DI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, DI
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DX
+
+mulAvxGFNI_3x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x6_end:
+ RET
+
+// func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x6_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), SI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, SI
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, CX
+
+mulGFNI_3x6_64Xor_loop:
+ // Load 6 outputs
+ VMOVDQU64 (DI), Z18
+ VMOVDQU64 (R8), Z19
+ VMOVDQU64 (R9), Z20
+ VMOVDQU64 (R10), Z21
+ VMOVDQU64 (R11), Z22
+ VMOVDQU64 (SI), Z23
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (DX), Z24
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
+ VXORPD Z18, Z25, Z18
+ VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
+ VXORPD Z19, Z25, Z19
+ VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (BX), Z24
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
+ VXORPD Z18, Z25, Z18
+ VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
+ VXORPD Z19, Z25, Z19
+ VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (CX), Z24
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
+ VXORPD Z18, Z25, Z18
+ VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
+ VXORPD Z19, Z25, Z19
+ VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Store 6 outputs
+ VMOVDQU64 Z18, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z19, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z20, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z21, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z22, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z23, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x6Xor(SB), $0-88
+ // Loading 8 of 18 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), DI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, DI
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DX
+
+mulAvxGFNI_3x6Xor_loop:
+ // Load 6 outputs
+ VMOVDQU (R8), Y8
+ VMOVDQU (R9), Y9
+ VMOVDQU (R10), Y10
+ VMOVDQU (R11), Y11
+ VMOVDQU (R12), Y12
+ VMOVDQU (DI), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x6Xor_end:
+ RET
+
+// func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x7_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), SI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, SI
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, CX
+
+mulGFNI_3x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (DX), Z28
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z28, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z28, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z28, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z28, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z28, Z27
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (BX), Z28
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z7, Z28, Z29
+ VXORPD Z21, Z29, Z21
+ VGF2P8AFFINEQB $0x00, Z8, Z28, Z29
+ VXORPD Z22, Z29, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z28, Z29
+ VXORPD Z23, Z29, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (CX), Z28
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z14, Z28, Z29
+ VXORPD Z21, Z29, Z21
+ VGF2P8AFFINEQB $0x00, Z15, Z28, Z29
+ VXORPD Z22, Z29, Z22
+ VGF2P8AFFINEQB $0x00, Z16, Z28, Z29
+ VXORPD Z23, Z29, Z23
+ VGF2P8AFFINEQB $0x00, Z17, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z18, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z19, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z20, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Store 7 outputs
+ VMOVDQU64 Z21, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z22, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z23, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x7_64_loop
+ VZEROUPPER
+
+mulGFNI_3x7_64_end:
+ RET
+
+// func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x7(SB), $0-88
+ // Loading 7 of 21 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), DI
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, DI
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DX
+
+mulAvxGFNI_3x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y8, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x7_end:
+ RET
+
+// func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x7_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), CX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), SI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, SI
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, CX
+
+mulGFNI_3x7_64Xor_loop:
+ // Load 7 outputs
+ VMOVDQU64 (DI), Z21
+ VMOVDQU64 (R8), Z22
+ VMOVDQU64 (R9), Z23
+ VMOVDQU64 (R10), Z24
+ VMOVDQU64 (R11), Z25
+ VMOVDQU64 (R12), Z26
+ VMOVDQU64 (SI), Z27
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (DX), Z28
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z28, Z29
+ VXORPD Z21, Z29, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z28, Z29
+ VXORPD Z22, Z29, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z28, Z29
+ VXORPD Z23, Z29, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (BX), Z28
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z7, Z28, Z29
+ VXORPD Z21, Z29, Z21
+ VGF2P8AFFINEQB $0x00, Z8, Z28, Z29
+ VXORPD Z22, Z29, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z28, Z29
+ VXORPD Z23, Z29, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (CX), Z28
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z14, Z28, Z29
+ VXORPD Z21, Z29, Z21
+ VGF2P8AFFINEQB $0x00, Z15, Z28, Z29
+ VXORPD Z22, Z29, Z22
+ VGF2P8AFFINEQB $0x00, Z16, Z28, Z29
+ VXORPD Z23, Z29, Z23
+ VGF2P8AFFINEQB $0x00, Z17, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z18, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z19, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z20, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Store 7 outputs
+ VMOVDQU64 Z21, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z22, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z23, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x7Xor(SB), $0-88
+ // Loading 7 of 21 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), DI
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, DI
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DX
+
+mulAvxGFNI_3x7Xor_loop:
+ // Load 7 outputs
+ VMOVDQU (R8), Y7
+ VMOVDQU (R9), Y8
+ VMOVDQU (R10), Y9
+ VMOVDQU (R11), Y10
+ VMOVDQU (R12), Y11
+ VMOVDQU (R13), Y12
+ VMOVDQU (DI), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y8, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x7Xor_end:
+ RET
+
+// func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x8_64(SB), $0-88
+ // Loading 22 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), DI
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, DI
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DX
+
+mulGFNI_3x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ VMOVDQU64 Z22, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z23, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x8_64_loop
+ VZEROUPPER
+
+mulGFNI_3x8_64_end:
+ RET
+
+// func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x8(SB), $0-88
+ // Loading 6 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), DI
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, DI
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DX
+
+mulAvxGFNI_3x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y7, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x8_end:
+ RET
+
+// func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x8_64Xor(SB), $0-88
+ // Loading 22 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), DI
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, DI
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DX
+
+mulGFNI_3x8_64Xor_loop:
+ // Load 8 outputs
+ VMOVDQU64 (R8), Z22
+ VMOVDQU64 (R9), Z23
+ VMOVDQU64 (R10), Z24
+ VMOVDQU64 (R11), Z25
+ VMOVDQU64 (R12), Z26
+ VMOVDQU64 (R13), Z27
+ VMOVDQU64 (R14), Z28
+ VMOVDQU64 (DI), Z29
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ VMOVDQU64 Z22, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z23, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x8Xor(SB), $0-88
+ // Loading 6 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), DI
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, DI
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DX
+
+mulAvxGFNI_3x8Xor_loop:
+ // Load 8 outputs
+ VMOVDQU (R8), Y6
+ VMOVDQU (R9), Y7
+ VMOVDQU (R10), Y8
+ VMOVDQU (R11), Y9
+ VMOVDQU (R12), Y10
+ VMOVDQU (R13), Y11
+ VMOVDQU (R14), Y12
+ VMOVDQU (DI), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y7, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x8Xor_end:
+ RET
+
+// func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x9_64(SB), $8-88
+ // Loading 21 of 27 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DX
+
+mulGFNI_3x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ VMOVDQU64 Z21, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x9_64_loop
+ VZEROUPPER
+
+mulGFNI_3x9_64_end:
+ RET
+
+// func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x9(SB), $8-88
+ // Loading 5 of 27 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DX
+
+mulAvxGFNI_3x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x9_end:
+ RET
+
+// func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x9_64Xor(SB), $8-88
+ // Loading 21 of 27 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DX
+
+mulGFNI_3x9_64Xor_loop:
+ // Load 9 outputs
+ VMOVDQU64 (R8), Z21
+ VMOVDQU64 (R9), Z22
+ VMOVDQU64 (R10), Z23
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (DI), Z29
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ VMOVDQU64 Z21, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_3x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x9Xor(SB), $8-88
+ // Loading 5 of 27 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DX
+
+mulAvxGFNI_3x9Xor_loop:
+ // Load 9 outputs
+ VMOVDQU (R8), Y5
+ VMOVDQU (R9), Y6
+ VMOVDQU (R10), Y7
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (DI), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_3x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x9Xor_end:
+ RET
+
+// func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x10_64(SB), $8-88
+ // Loading 20 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), AX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), R15
+ MOVQ 216(SI), SI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, SI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_3x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ VMOVDQU64 Z20, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z21, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_3x10_64_loop
+ VZEROUPPER
+
+mulGFNI_3x10_64_end:
+ RET
+
+// func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x10(SB), $8-88
+ // Loading 4 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), AX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), R15
+ MOVQ 216(SI), SI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, SI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_3x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ VMOVDQU Y4, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_3x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x10_end:
+ RET
+
+// func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_3x10_64Xor(SB), $8-88
+ // Loading 20 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_3x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), AX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), R15
+ MOVQ 216(SI), SI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, SI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_3x10_64Xor_loop:
+ // Load 10 outputs
+ VMOVDQU64 (DI), Z20
+ VMOVDQU64 (R8), Z21
+ VMOVDQU64 (R9), Z22
+ VMOVDQU64 (R10), Z23
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (SI), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ VMOVDQU64 Z20, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z21, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (SI)
+ ADDQ $0x40, SI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_3x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_3x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_3x10Xor(SB), $8-88
+ // Loading 4 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_3x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), AX
+ MOVQ out_base+48(FP), SI
+ MOVQ out_base+48(FP), SI
+ MOVQ (SI), DI
+ MOVQ 24(SI), R8
+ MOVQ 48(SI), R9
+ MOVQ 72(SI), R10
+ MOVQ 96(SI), R11
+ MOVQ 120(SI), R12
+ MOVQ 144(SI), R13
+ MOVQ 168(SI), R14
+ MOVQ 192(SI), R15
+ MOVQ 216(SI), SI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, SI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_3x10Xor_loop:
+ // Load 10 outputs
+ VMOVDQU (DI), Y4
+ VMOVDQU (R8), Y5
+ VMOVDQU (R9), Y6
+ VMOVDQU (R10), Y7
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (SI), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ VMOVDQU Y4, (DI)
+ ADDQ $0x20, DI
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (SI)
+ ADDQ $0x20, SI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_3x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_3x10Xor_end:
+ RET
+
+// func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 7 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), DI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, CX
+
+mulGFNI_4x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z5
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z5, Z4
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z5
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z5
+ VXORPD Z4, Z5, Z4
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z5
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z5, Z5
+ VXORPD Z4, Z5, Z4
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (CX), Z5
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z3, Z5, Z5
+ VXORPD Z4, Z5, Z4
+
+ // Store 1 outputs
+ VMOVDQU64 Z4, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x1_64_loop
+ VZEROUPPER
+
+mulGFNI_4x1_64_end:
+ RET
+
+// func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 7 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), DI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, CX
+
+mulAvxGFNI_4x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y5
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y5, Y4
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y5
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y5, Y5
+ VXORPD Y4, Y5, Y4
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y5
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y5, Y5
+ VXORPD Y4, Y5, Y4
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (CX), Y5
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y3, Y5, Y5
+ VXORPD Y4, Y5, Y4
+
+ // Store 1 outputs
+ VMOVDQU Y4, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x1_end:
+ RET
+
+// func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 7 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), DI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, CX
+
+mulGFNI_4x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (DI), Z4
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z5
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z5, Z5
+ VXORPD Z4, Z5, Z4
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z5
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z5
+ VXORPD Z4, Z5, Z4
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z5
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z5, Z5
+ VXORPD Z4, Z5, Z4
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (CX), Z5
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z3, Z5, Z5
+ VXORPD Z4, Z5, Z4
+
+ // Store 1 outputs
+ VMOVDQU64 Z4, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 7 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), DI
+ MOVQ start+72(FP), R8
+
+ // Add start offset to output
+ ADDQ R8, DI
+
+ // Add start offset to input
+ ADDQ R8, DX
+ ADDQ R8, BX
+ ADDQ R8, SI
+ ADDQ R8, CX
+
+mulAvxGFNI_4x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (DI), Y4
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y5
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y5, Y5
+ VXORPD Y4, Y5, Y4
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y5
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y5, Y5
+ VXORPD Y4, Y5, Y4
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y5
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y5, Y5
+ VXORPD Y4, Y5, Y4
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (CX), Y5
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y3, Y5, Y5
+ VXORPD Y4, Y5, Y4
+
+ // Store 1 outputs
+ VMOVDQU Y4, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x1Xor_end:
+ RET
+
+// func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), DI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+ ADDQ R9, DI
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, CX
+
+mulGFNI_4x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z10
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z10, Z8
+ VGF2P8AFFINEQB $0x00, Z1, Z10, Z9
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z10
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z10, Z11
+ VXORPD Z8, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z3, Z10, Z11
+ VXORPD Z9, Z11, Z9
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z10
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z10, Z11
+ VXORPD Z8, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z5, Z10, Z11
+ VXORPD Z9, Z11, Z9
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (CX), Z10
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z6, Z10, Z11
+ VXORPD Z8, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z7, Z10, Z11
+ VXORPD Z9, Z11, Z9
+
+ // Store 2 outputs
+ VMOVDQU64 Z8, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z9, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x2_64_loop
+ VZEROUPPER
+
+mulGFNI_4x2_64_end:
+ RET
+
+// func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x2(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), DI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+ ADDQ R9, DI
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, CX
+
+mulAvxGFNI_4x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y10, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y10, Y9
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y2, Y10, Y11
+ VXORPD Y8, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y10, Y11
+ VXORPD Y9, Y11, Y9
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y10, Y11
+ VXORPD Y8, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y5, Y10, Y11
+ VXORPD Y9, Y11, Y9
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (CX), Y10
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y6, Y10, Y11
+ VXORPD Y8, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y10, Y11
+ VXORPD Y9, Y11, Y9
+
+ // Store 2 outputs
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x2_end:
+ RET
+
+// func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), DI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+ ADDQ R9, DI
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, CX
+
+mulGFNI_4x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (R8), Z8
+ VMOVDQU64 (DI), Z9
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z10
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z10, Z11
+ VXORPD Z8, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z1, Z10, Z11
+ VXORPD Z9, Z11, Z9
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z10
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z10, Z11
+ VXORPD Z8, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z3, Z10, Z11
+ VXORPD Z9, Z11, Z9
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z10
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z10, Z11
+ VXORPD Z8, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z5, Z10, Z11
+ VXORPD Z9, Z11, Z9
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (CX), Z10
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z6, Z10, Z11
+ VXORPD Z8, Z11, Z8
+ VGF2P8AFFINEQB $0x00, Z7, Z10, Z11
+ VXORPD Z9, Z11, Z9
+
+ // Store 2 outputs
+ VMOVDQU64 Z8, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z9, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x2Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), DI
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+ ADDQ R9, DI
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, CX
+
+mulAvxGFNI_4x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R8), Y8
+ VMOVDQU (DI), Y9
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y10, Y11
+ VXORPD Y8, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y10, Y11
+ VXORPD Y9, Y11, Y9
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y2, Y10, Y11
+ VXORPD Y8, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y10, Y11
+ VXORPD Y9, Y11, Y9
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y10, Y11
+ VXORPD Y8, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y5, Y10, Y11
+ VXORPD Y9, Y11, Y9
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (CX), Y10
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y6, Y10, Y11
+ VXORPD Y8, Y11, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y10, Y11
+ VXORPD Y9, Y11, Y9
+
+ // Store 2 outputs
+ VMOVDQU Y8, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y9, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x2Xor_end:
+ RET
+
+// func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), DI
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, DI
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, CX
+
+mulGFNI_4x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z15
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z1, Z15, Z13
+ VGF2P8AFFINEQB $0x00, Z2, Z15, Z14
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z15
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z4, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z5, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z15
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z7, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z8, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (CX), Z15
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z9, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z10, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z11, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Store 3 outputs
+ VMOVDQU64 Z12, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z13, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z14, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x3_64_loop
+ VZEROUPPER
+
+mulGFNI_4x3_64_end:
+ RET
+
+// func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x3(SB), $0-88
+ // Loading 11 of 12 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R8
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, R8
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, DX
+
+mulAvxGFNI_4x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x3_end:
+ RET
+
+// func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x3_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), DI
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R8
+ ADDQ R10, R9
+ ADDQ R10, DI
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, CX
+
+mulGFNI_4x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (R8), Z12
+ VMOVDQU64 (R9), Z13
+ VMOVDQU64 (DI), Z14
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z15
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z1, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z2, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z15
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z4, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z5, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z15
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z7, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z8, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (CX), Z15
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z9, Z15, Z16
+ VXORPD Z12, Z16, Z12
+ VGF2P8AFFINEQB $0x00, Z10, Z15, Z16
+ VXORPD Z13, Z16, Z13
+ VGF2P8AFFINEQB $0x00, Z11, Z15, Z16
+ VXORPD Z14, Z16, Z14
+
+ // Store 3 outputs
+ VMOVDQU64 Z12, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z13, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z14, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x3Xor(SB), $0-88
+ // Loading 11 of 12 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 17 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R8
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, R8
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, DX
+
+mulAvxGFNI_4x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R9), Y11
+ VMOVDQU (R10), Y12
+ VMOVDQU (R8), Y13
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x3Xor_end:
+ RET
+
+// func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x4_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), DI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DI
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, CX
+
+mulGFNI_4x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z20
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z20, Z16
+ VGF2P8AFFINEQB $0x00, Z1, Z20, Z17
+ VGF2P8AFFINEQB $0x00, Z2, Z20, Z18
+ VGF2P8AFFINEQB $0x00, Z3, Z20, Z19
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z20
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (SI), Z20
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (CX), Z20
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z15, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Store 4 outputs
+ VMOVDQU64 Z16, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z17, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z18, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z19, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x4_64_loop
+ VZEROUPPER
+
+mulGFNI_4x4_64_end:
+ RET
+
+// func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x4(SB), $0-88
+ // Loading 10 of 16 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R8
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R8
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, DX
+
+mulAvxGFNI_4x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y12, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x4_end:
+ RET
+
+// func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x4_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), DI
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, DI
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, CX
+
+mulGFNI_4x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (R8), Z16
+ VMOVDQU64 (R9), Z17
+ VMOVDQU64 (R10), Z18
+ VMOVDQU64 (DI), Z19
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z20
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z1, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z20
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (SI), Z20
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (CX), Z20
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
+ VXORPD Z16, Z21, Z16
+ VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
+ VXORPD Z17, Z21, Z17
+ VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z15, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Store 4 outputs
+ VMOVDQU64 Z16, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z17, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z18, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z19, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x4Xor(SB), $0-88
+ // Loading 10 of 16 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R8
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R8
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, DX
+
+mulAvxGFNI_4x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (R9), Y10
+ VMOVDQU (R10), Y11
+ VMOVDQU (R11), Y12
+ VMOVDQU (R8), Y13
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y12, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x4Xor_end:
+ RET
+
+// func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x5_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 27 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), DI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, DI
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, CX
+
+mulGFNI_4x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z25
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z25, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z25, Z24
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (BX), Z25
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z5, Z25, Z26
+ VXORPD Z20, Z26, Z20
+ VGF2P8AFFINEQB $0x00, Z6, Z25, Z26
+ VXORPD Z21, Z26, Z21
+ VGF2P8AFFINEQB $0x00, Z7, Z25, Z26
+ VXORPD Z22, Z26, Z22
+ VGF2P8AFFINEQB $0x00, Z8, Z25, Z26
+ VXORPD Z23, Z26, Z23
+ VGF2P8AFFINEQB $0x00, Z9, Z25, Z26
+ VXORPD Z24, Z26, Z24
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (SI), Z25
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z25, Z26
+ VXORPD Z20, Z26, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z25, Z26
+ VXORPD Z21, Z26, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z25, Z26
+ VXORPD Z22, Z26, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z25, Z26
+ VXORPD Z23, Z26, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z25, Z26
+ VXORPD Z24, Z26, Z24
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (CX), Z25
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z15, Z25, Z26
+ VXORPD Z20, Z26, Z20
+ VGF2P8AFFINEQB $0x00, Z16, Z25, Z26
+ VXORPD Z21, Z26, Z21
+ VGF2P8AFFINEQB $0x00, Z17, Z25, Z26
+ VXORPD Z22, Z26, Z22
+ VGF2P8AFFINEQB $0x00, Z18, Z25, Z26
+ VXORPD Z23, Z26, Z23
+ VGF2P8AFFINEQB $0x00, Z19, Z25, Z26
+ VXORPD Z24, Z26, Z24
+
+ // Store 5 outputs
+ VMOVDQU64 Z20, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z21, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z22, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z23, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z24, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x5_64_loop
+ VZEROUPPER
+
+mulGFNI_4x5_64_end:
+ RET
+
+// func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x5(SB), $0-88
+ // Loading 9 of 20 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 27 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R8
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R8
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, DX
+
+mulAvxGFNI_4x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x5_end:
+ RET
+
+// func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x5_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 27 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), DI
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, DI
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, CX
+
+mulGFNI_4x5_64Xor_loop:
+ // Load 5 outputs
+ VMOVDQU64 (R8), Z20
+ VMOVDQU64 (R9), Z21
+ VMOVDQU64 (R10), Z22
+ VMOVDQU64 (R11), Z23
+ VMOVDQU64 (DI), Z24
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z25
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z25, Z26
+ VXORPD Z20, Z26, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z25, Z26
+ VXORPD Z21, Z26, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z25, Z26
+ VXORPD Z22, Z26, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z25, Z26
+ VXORPD Z23, Z26, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z25, Z26
+ VXORPD Z24, Z26, Z24
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (BX), Z25
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z5, Z25, Z26
+ VXORPD Z20, Z26, Z20
+ VGF2P8AFFINEQB $0x00, Z6, Z25, Z26
+ VXORPD Z21, Z26, Z21
+ VGF2P8AFFINEQB $0x00, Z7, Z25, Z26
+ VXORPD Z22, Z26, Z22
+ VGF2P8AFFINEQB $0x00, Z8, Z25, Z26
+ VXORPD Z23, Z26, Z23
+ VGF2P8AFFINEQB $0x00, Z9, Z25, Z26
+ VXORPD Z24, Z26, Z24
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (SI), Z25
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z25, Z26
+ VXORPD Z20, Z26, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z25, Z26
+ VXORPD Z21, Z26, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z25, Z26
+ VXORPD Z22, Z26, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z25, Z26
+ VXORPD Z23, Z26, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z25, Z26
+ VXORPD Z24, Z26, Z24
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (CX), Z25
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z15, Z25, Z26
+ VXORPD Z20, Z26, Z20
+ VGF2P8AFFINEQB $0x00, Z16, Z25, Z26
+ VXORPD Z21, Z26, Z21
+ VGF2P8AFFINEQB $0x00, Z17, Z25, Z26
+ VXORPD Z22, Z26, Z22
+ VGF2P8AFFINEQB $0x00, Z18, Z25, Z26
+ VXORPD Z23, Z26, Z23
+ VGF2P8AFFINEQB $0x00, Z19, Z25, Z26
+ VXORPD Z24, Z26, Z24
+
+ // Store 5 outputs
+ VMOVDQU64 Z20, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z21, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z22, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z23, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z24, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x5Xor(SB), $0-88
+ // Loading 9 of 20 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 27 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R8
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R8
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, DX
+
+mulAvxGFNI_4x5Xor_loop:
+ // Load 5 outputs
+ VMOVDQU (R9), Y9
+ VMOVDQU (R10), Y10
+ VMOVDQU (R11), Y11
+ VMOVDQU (R12), Y12
+ VMOVDQU (R8), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x5Xor_end:
+ RET
+
+// func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x6_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), DI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, DI
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, CX
+
+mulGFNI_4x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (CX), Z30
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ VMOVDQU64 Z24, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z25, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z26, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z27, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z28, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z29, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x6_64_loop
+ VZEROUPPER
+
+mulGFNI_4x6_64_end:
+ RET
+
+// func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x6(SB), $0-88
+ // Loading 8 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R8
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R8
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, DX
+
+mulAvxGFNI_4x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x6_end:
+ RET
+
+// func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x6_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), CX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), DI
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, DI
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, CX
+
+mulGFNI_4x6_64Xor_loop:
+ // Load 6 outputs
+ VMOVDQU64 (R8), Z24
+ VMOVDQU64 (R9), Z25
+ VMOVDQU64 (R10), Z26
+ VMOVDQU64 (R11), Z27
+ VMOVDQU64 (R12), Z28
+ VMOVDQU64 (DI), Z29
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (CX), Z30
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ VMOVDQU64 Z24, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z25, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z26, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z27, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z28, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z29, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x6Xor(SB), $0-88
+ // Loading 8 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R8
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R8
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, DX
+
+mulAvxGFNI_4x6Xor_loop:
+ // Load 6 outputs
+ VMOVDQU (R9), Y8
+ VMOVDQU (R10), Y9
+ VMOVDQU (R11), Y10
+ VMOVDQU (R12), Y11
+ VMOVDQU (R13), Y12
+ VMOVDQU (R8), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x6Xor_end:
+ RET
+
+// func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x7_64(SB), $0-88
+ // Loading 23 of 28 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 37 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R8
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R8
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, DX
+
+mulGFNI_4x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ VMOVDQU64 Z23, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x7_64_loop
+ VZEROUPPER
+
+mulGFNI_4x7_64_end:
+ RET
+
+// func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x7(SB), $0-88
+ // Loading 7 of 28 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 37 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R8
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R8
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, DX
+
+mulAvxGFNI_4x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x7_end:
+ RET
+
+// func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x7_64Xor(SB), $0-88
+ // Loading 23 of 28 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 37 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R8
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R8
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, DX
+
+mulGFNI_4x7_64Xor_loop:
+ // Load 7 outputs
+ VMOVDQU64 (R9), Z23
+ VMOVDQU64 (R10), Z24
+ VMOVDQU64 (R11), Z25
+ VMOVDQU64 (R12), Z26
+ VMOVDQU64 (R13), Z27
+ VMOVDQU64 (R14), Z28
+ VMOVDQU64 (R8), Z29
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ VMOVDQU64 Z23, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x7Xor(SB), $0-88
+ // Loading 7 of 28 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 37 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R8
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R8
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, DX
+
+mulAvxGFNI_4x7Xor_loop:
+ // Load 7 outputs
+ VMOVDQU (R9), Y7
+ VMOVDQU (R10), Y8
+ VMOVDQU (R11), Y9
+ VMOVDQU (R12), Y10
+ VMOVDQU (R13), Y11
+ VMOVDQU (R14), Y12
+ VMOVDQU (R8), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x7Xor_end:
+ RET
+
+// func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x8_64(SB), $8-88
+ // Loading 22 of 32 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, DX
+
+mulGFNI_4x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x8_64_loop
+ VZEROUPPER
+
+mulGFNI_4x8_64_end:
+ RET
+
+// func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x8(SB), $8-88
+ // Loading 6 of 32 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, DX
+
+mulAvxGFNI_4x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x8_end:
+ RET
+
+// func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x8_64Xor(SB), $8-88
+ // Loading 22 of 32 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, DX
+
+mulGFNI_4x8_64Xor_loop:
+ // Load 8 outputs
+ VMOVDQU64 (R9), Z22
+ VMOVDQU64 (R10), Z23
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R8), Z29
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_4x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x8Xor(SB), $8-88
+ // Loading 6 of 32 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
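+	// This AVX variant works on 32 bytes per shard per iteration using YMM
+	// registers; only the first six matrices stay in Y0-Y5, the rest are
+	// re-broadcast from the matrix slice with VBROADCASTSD inside the loop.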
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, DX
+
+mulAvxGFNI_4x8Xor_loop:
+ // Load 8 outputs
+ VMOVDQU (R9), Y6
+ VMOVDQU (R10), Y7
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R8), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_4x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x8Xor_end:
+ RET
+
+// func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x9_64(SB), $8-88
+ // Loading 21 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), AX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, AX
+
+ // Reload length to save a register
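+	// AX, which held the shifted length, was repurposed as the fourth input
+	// pointer above, and the start offset in BP is no longer needed.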
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_4x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ VMOVDQU64 Z21, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_4x9_64_loop
+ VZEROUPPER
+
+mulGFNI_4x9_64_end:
+ RET
+
+// func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x9(SB), $8-88
+ // Loading 5 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), AX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_4x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_4x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x9_end:
+ RET
+
+// func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x9_64Xor(SB), $8-88
+ // Loading 21 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
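+	// Xor variant: the current contents of the nine output shards are loaded
+	// each iteration and the GF(2^8) partial products are XORed into them
+	// before being stored back, instead of overwriting the outputs.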
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), AX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_4x9_64Xor_loop:
+ // Load 9 outputs
+ VMOVDQU64 (R8), Z21
+ VMOVDQU64 (R9), Z22
+ VMOVDQU64 (R10), Z23
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (DI), Z29
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ VMOVDQU64 Z21, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (DI)
+ ADDQ $0x40, DI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_4x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x9Xor(SB), $8-88
+ // Loading 5 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), AX
+ MOVQ out_base+48(FP), DI
+ MOVQ out_base+48(FP), DI
+ MOVQ (DI), R8
+ MOVQ 24(DI), R9
+ MOVQ 48(DI), R10
+ MOVQ 72(DI), R11
+ MOVQ 96(DI), R12
+ MOVQ 120(DI), R13
+ MOVQ 144(DI), R14
+ MOVQ 168(DI), R15
+ MOVQ 192(DI), DI
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, DI
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_4x9Xor_loop:
+ // Load 9 outputs
+ VMOVDQU (R8), Y5
+ VMOVDQU (R9), Y6
+ VMOVDQU (R10), Y7
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (DI), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (DI)
+ ADDQ $0x20, DI
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_4x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x9Xor_end:
+ RET
+
+// func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x10_64(SB), $0-88
+ // Loading 20 of 40 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 52 YMM used
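+	// With ten outputs there are more destination pointers than spare
+	// general-purpose registers, so each output pointer is reloaded from the
+	// out slice every iteration and addressed with the running offset in R9.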
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to input
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, DX
+
+mulGFNI_4x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R8), R10
+ VMOVDQU64 Z20, (R10)(R9*1)
+ MOVQ 24(R8), R10
+ VMOVDQU64 Z21, (R10)(R9*1)
+ MOVQ 48(R8), R10
+ VMOVDQU64 Z22, (R10)(R9*1)
+ MOVQ 72(R8), R10
+ VMOVDQU64 Z23, (R10)(R9*1)
+ MOVQ 96(R8), R10
+ VMOVDQU64 Z24, (R10)(R9*1)
+ MOVQ 120(R8), R10
+ VMOVDQU64 Z25, (R10)(R9*1)
+ MOVQ 144(R8), R10
+ VMOVDQU64 Z26, (R10)(R9*1)
+ MOVQ 168(R8), R10
+ VMOVDQU64 Z27, (R10)(R9*1)
+ MOVQ 192(R8), R10
+ VMOVDQU64 Z28, (R10)(R9*1)
+ MOVQ 216(R8), R10
+ VMOVDQU64 Z29, (R10)(R9*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R9
+ DECQ AX
+ JNZ mulGFNI_4x10_64_loop
+ VZEROUPPER
+
+mulGFNI_4x10_64_end:
+ RET
+
+// func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x10(SB), $0-88
+ // Loading 4 of 40 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 52 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to input
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, DX
+
+mulAvxGFNI_4x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R8), R10
+ VMOVDQU Y4, (R10)(R9*1)
+ MOVQ 24(R8), R10
+ VMOVDQU Y5, (R10)(R9*1)
+ MOVQ 48(R8), R10
+ VMOVDQU Y6, (R10)(R9*1)
+ MOVQ 72(R8), R10
+ VMOVDQU Y7, (R10)(R9*1)
+ MOVQ 96(R8), R10
+ VMOVDQU Y8, (R10)(R9*1)
+ MOVQ 120(R8), R10
+ VMOVDQU Y9, (R10)(R9*1)
+ MOVQ 144(R8), R10
+ VMOVDQU Y10, (R10)(R9*1)
+ MOVQ 168(R8), R10
+ VMOVDQU Y11, (R10)(R9*1)
+ MOVQ 192(R8), R10
+ VMOVDQU Y12, (R10)(R9*1)
+ MOVQ 216(R8), R10
+ VMOVDQU Y13, (R10)(R9*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R9
+ DECQ AX
+ JNZ mulAvxGFNI_4x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x10_end:
+ RET
+
+// func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_4x10_64Xor(SB), $0-88
+ // Loading 20 of 40 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 52 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_4x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to input
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, DX
+
+mulGFNI_4x10_64Xor_loop:
+ // Load 10 outputs
+ MOVQ (R8), R10
+ VMOVDQU64 (R10)(R9*1), Z20
+ MOVQ 24(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z21
+ MOVQ 48(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z22
+ MOVQ 72(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z23
+ MOVQ 96(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z24
+ MOVQ 120(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z25
+ MOVQ 144(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z26
+ MOVQ 168(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z27
+ MOVQ 192(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z28
+ MOVQ 216(R8), R10
+ VMOVDQU64 (R10)(R9*1), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R8), R10
+ VMOVDQU64 Z20, (R10)(R9*1)
+ MOVQ 24(R8), R10
+ VMOVDQU64 Z21, (R10)(R9*1)
+ MOVQ 48(R8), R10
+ VMOVDQU64 Z22, (R10)(R9*1)
+ MOVQ 72(R8), R10
+ VMOVDQU64 Z23, (R10)(R9*1)
+ MOVQ 96(R8), R10
+ VMOVDQU64 Z24, (R10)(R9*1)
+ MOVQ 120(R8), R10
+ VMOVDQU64 Z25, (R10)(R9*1)
+ MOVQ 144(R8), R10
+ VMOVDQU64 Z26, (R10)(R9*1)
+ MOVQ 168(R8), R10
+ VMOVDQU64 Z27, (R10)(R9*1)
+ MOVQ 192(R8), R10
+ VMOVDQU64 Z28, (R10)(R9*1)
+ MOVQ 216(R8), R10
+ VMOVDQU64 Z29, (R10)(R9*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R9
+ DECQ AX
+ JNZ mulGFNI_4x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_4x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_4x10Xor(SB), $0-88
+ // Loading 4 of 40 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 52 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_4x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), DX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to input
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, DX
+
+mulAvxGFNI_4x10Xor_loop:
+ // Load 10 outputs
+ MOVQ (R8), R10
+ VMOVDQU (R10)(R9*1), Y4
+ MOVQ 24(R8), R10
+ VMOVDQU (R10)(R9*1), Y5
+ MOVQ 48(R8), R10
+ VMOVDQU (R10)(R9*1), Y6
+ MOVQ 72(R8), R10
+ VMOVDQU (R10)(R9*1), Y7
+ MOVQ 96(R8), R10
+ VMOVDQU (R10)(R9*1), Y8
+ MOVQ 120(R8), R10
+ VMOVDQU (R10)(R9*1), Y9
+ MOVQ 144(R8), R10
+ VMOVDQU (R10)(R9*1), Y10
+ MOVQ 168(R8), R10
+ VMOVDQU (R10)(R9*1), Y11
+ MOVQ 192(R8), R10
+ VMOVDQU (R10)(R9*1), Y12
+ MOVQ 216(R8), R10
+ VMOVDQU (R10)(R9*1), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R8), R10
+ VMOVDQU Y4, (R10)(R9*1)
+ MOVQ 24(R8), R10
+ VMOVDQU Y5, (R10)(R9*1)
+ MOVQ 48(R8), R10
+ VMOVDQU Y6, (R10)(R9*1)
+ MOVQ 72(R8), R10
+ VMOVDQU Y7, (R10)(R9*1)
+ MOVQ 96(R8), R10
+ VMOVDQU Y8, (R10)(R9*1)
+ MOVQ 120(R8), R10
+ VMOVDQU Y9, (R10)(R9*1)
+ MOVQ 144(R8), R10
+ VMOVDQU Y10, (R10)(R9*1)
+ MOVQ 168(R8), R10
+ VMOVDQU Y11, (R10)(R9*1)
+ MOVQ 192(R8), R10
+ VMOVDQU Y12, (R10)(R9*1)
+ MOVQ 216(R8), R10
+ VMOVDQU Y13, (R10)(R9*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R9
+ DECQ AX
+ JNZ mulAvxGFNI_4x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_4x10Xor_end:
+ RET
+
+// func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
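+	// A single output needs only five 8x8 bit matrices (one per input), so
+	// all of them fit in Z0-Z4 and no in-loop broadcasts are required.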
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, CX
+
+mulGFNI_5x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z6
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z6, Z5
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z6
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z6
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z6
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (CX), Z6
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z4, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Store 1 outputs
+ VMOVDQU64 Z5, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x1_64_loop
+ VZEROUPPER
+
+mulGFNI_5x1_64_end:
+ RET
+
+// func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, CX
+
+mulAvxGFNI_5x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y6
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y6, Y5
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y6
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y6
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y6
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (CX), Y6
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y4, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Store 1 outputs
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x1_end:
+ RET
+
+// func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, CX
+
+mulGFNI_5x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (R8), Z5
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z6
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z6
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z6
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z6
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (CX), Z6
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z4, Z6, Z6
+ VXORPD Z5, Z6, Z5
+
+ // Store 1 outputs
+ VMOVDQU64 Z5, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 8 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R8
+ MOVQ start+72(FP), R9
+
+ // Add start offset to output
+ ADDQ R9, R8
+
+ // Add start offset to input
+ ADDQ R9, DX
+ ADDQ R9, BX
+ ADDQ R9, SI
+ ADDQ R9, DI
+ ADDQ R9, CX
+
+mulAvxGFNI_5x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R8), Y5
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y6
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y6
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y6
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y6
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (CX), Y6
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y4, Y6, Y6
+ VXORPD Y5, Y6, Y5
+
+ // Store 1 outputs
+ VMOVDQU Y5, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x1Xor_end:
+ RET
+
+// func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R8
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+ ADDQ R10, R8
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, CX
+
+mulGFNI_5x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z12
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z12, Z10
+ VGF2P8AFFINEQB $0x00, Z1, Z12, Z11
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z12
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z12
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z12
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (CX), Z12
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z8, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z9, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Store 2 outputs
+ VMOVDQU64 Z10, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z11, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x2_64_loop
+ VZEROUPPER
+
+mulGFNI_5x2_64_end:
+ RET
+
+// func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x2(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R8
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+ ADDQ R10, R8
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, CX
+
+mulAvxGFNI_5x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y12, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y12, Y11
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (DI), Y12
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (CX), Y12
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y8, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Store 2 outputs
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x2_end:
+ RET
+
+// func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R8
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+ ADDQ R10, R8
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, CX
+
+mulGFNI_5x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (R9), Z10
+ VMOVDQU64 (R8), Z11
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z12
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z1, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z12
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z12
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z12
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (CX), Z12
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z8, Z12, Z13
+ VXORPD Z10, Z13, Z10
+ VGF2P8AFFINEQB $0x00, Z9, Z12, Z13
+ VXORPD Z11, Z13, Z11
+
+ // Store 2 outputs
+ VMOVDQU64 Z10, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z11, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x2Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 14 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R8
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+ ADDQ R10, R8
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, CX
+
+mulAvxGFNI_5x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R9), Y10
+ VMOVDQU (R8), Y11
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y12
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (BX), Y12
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (SI), Y12
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (DI), Y12
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (CX), Y12
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y8, Y12, Y13
+ VXORPD Y10, Y13, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y12, Y13
+ VXORPD Y11, Y13, Y11
+
+ // Store 2 outputs
+ VMOVDQU Y10, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y11, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x2Xor_end:
+ RET
+
+// func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R8
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, R8
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, CX
+
+mulGFNI_5x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z18
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z18, Z15
+ VGF2P8AFFINEQB $0x00, Z1, Z18, Z16
+ VGF2P8AFFINEQB $0x00, Z2, Z18, Z17
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z18
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z18
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z18
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (CX), Z18
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Store 3 outputs
+ VMOVDQU64 Z15, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z16, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z17, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x3_64_loop
+ VZEROUPPER
+
+mulGFNI_5x3_64_end:
+ RET
+
+// func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x3(SB), $0-88
+ // Loading 11 of 15 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R9
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R9
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, DX
+
+mulAvxGFNI_5x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y12, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x3_end:
+ RET
+
+// func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x3_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R8
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R9
+ ADDQ R11, R10
+ ADDQ R11, R8
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, CX
+
+mulGFNI_5x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (R9), Z15
+ VMOVDQU64 (R10), Z16
+ VMOVDQU64 (R8), Z17
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z18
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z1, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z18
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z18
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z18
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (CX), Z18
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
+ VXORPD Z15, Z19, Z15
+ VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Store 3 outputs
+ VMOVDQU64 Z15, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z16, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z17, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x3Xor(SB), $0-88
+ // Loading 11 of 15 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R9
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R9
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, DX
+
+mulAvxGFNI_5x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R10), Y11
+ VMOVDQU (R11), Y12
+ VMOVDQU (R9), Y13
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y12, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x3Xor_end:
+ RET
+
+// func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x4_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R8
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R8
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, CX
+
+mulGFNI_5x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z24
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z24, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z24, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z24, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z24, Z23
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z24
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (SI), Z24
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (DI), Z24
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (CX), Z24
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Store 4 outputs
+ VMOVDQU64 Z20, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z21, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z22, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z23, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x4_64_loop
+ VZEROUPPER
+
+mulGFNI_5x4_64_end:
+ RET
+
+// func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x4(SB), $0-88
+ // Loading 10 of 20 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R9
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R9
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, DX
+
+mulAvxGFNI_5x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x4_end:
+ RET
+
+// func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x4_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R8
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R8
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, CX
+
+mulGFNI_5x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (R9), Z20
+ VMOVDQU64 (R10), Z21
+ VMOVDQU64 (R11), Z22
+ VMOVDQU64 (R8), Z23
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z24
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z24
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (SI), Z24
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (DI), Z24
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (CX), Z24
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
+ VXORPD Z20, Z25, Z20
+ VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Store 4 outputs
+ VMOVDQU64 Z20, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z21, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z22, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z23, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x4Xor(SB), $0-88
+ // Loading 10 of 20 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R9
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R9
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, DX
+
+mulAvxGFNI_5x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (R10), Y10
+ VMOVDQU (R11), Y11
+ VMOVDQU (R12), Y12
+ VMOVDQU (R9), Y13
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x4Xor_end:
+ RET
+
+// func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x5_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R8
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R8
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, CX
+
+mulGFNI_5x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (CX), Z30
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ VMOVDQU64 Z25, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z26, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z27, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z28, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z29, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x5_64_loop
+ VZEROUPPER
+
+mulGFNI_5x5_64_end:
+ RET
+
+// func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x5(SB), $0-88
+ // Loading 9 of 25 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R9
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R9
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, DX
+
+mulAvxGFNI_5x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x5_end:
+ RET
+
+// func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x5_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), CX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R8
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R8
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, CX
+
+mulGFNI_5x5_64Xor_loop:
+ // Load 5 outputs
+ VMOVDQU64 (R9), Z25
+ VMOVDQU64 (R10), Z26
+ VMOVDQU64 (R11), Z27
+ VMOVDQU64 (R12), Z28
+ VMOVDQU64 (R8), Z29
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (CX), Z30
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ VMOVDQU64 Z25, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z26, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z27, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z28, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z29, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x5Xor(SB), $0-88
+ // Loading 9 of 25 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R9
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R9
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, DX
+
+mulAvxGFNI_5x5Xor_loop:
+ // Load 5 outputs
+ VMOVDQU (R10), Y9
+ VMOVDQU (R11), Y10
+ VMOVDQU (R12), Y11
+ VMOVDQU (R13), Y12
+ VMOVDQU (R9), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x5Xor_end:
+ RET
+
+// func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x6_64(SB), $0-88
+ // Loading 24 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R9
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R9
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, DX
+
+mulGFNI_5x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x6_64_loop
+ VZEROUPPER
+
+mulGFNI_5x6_64_end:
+ RET
+
+// func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x6(SB), $0-88
+ // Loading 8 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R9
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R9
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, DX
+
+mulAvxGFNI_5x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x6_end:
+ RET
+
+// func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x6_64Xor(SB), $0-88
+ // Loading 24 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R9
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R9
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, DX
+
+mulGFNI_5x6_64Xor_loop:
+ // Load 6 outputs
+ VMOVDQU64 (R10), Z24
+ VMOVDQU64 (R11), Z25
+ VMOVDQU64 (R12), Z26
+ VMOVDQU64 (R13), Z27
+ VMOVDQU64 (R14), Z28
+ VMOVDQU64 (R9), Z29
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x6Xor(SB), $0-88
+ // Loading 8 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R9
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R9
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, DX
+
+mulAvxGFNI_5x6Xor_loop:
+ // Load 6 outputs
+ VMOVDQU (R10), Y8
+ VMOVDQU (R11), Y9
+ VMOVDQU (R12), Y10
+ VMOVDQU (R13), Y11
+ VMOVDQU (R14), Y12
+ VMOVDQU (R9), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x6Xor_end:
+ RET
+
+// func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x7_64(SB), $8-88
+ // Loading 23 of 35 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 44 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, DX
+
+mulGFNI_5x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x7_64_loop
+ VZEROUPPER
+
+mulGFNI_5x7_64_end:
+ RET
+
+// func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x7(SB), $8-88
+ // Loading 7 of 35 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 44 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, DX
+
+mulAvxGFNI_5x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x7_end:
+ RET
+
+// func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x7_64Xor(SB), $8-88
+ // Loading 23 of 35 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 44 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, DX
+
+mulGFNI_5x7_64Xor_loop:
+ // Load 7 outputs
+ VMOVDQU64 (R10), Z23
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R9), Z29
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_5x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x7Xor(SB), $8-88
+ // Loading 7 of 35 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 44 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, DX
+
+mulAvxGFNI_5x7Xor_loop:
+ // Load 7 outputs
+ VMOVDQU (R10), Y7
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R9), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_5x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x7Xor_end:
+ RET
+
+// func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x8_64(SB), $8-88
+ // Loading 22 of 40 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), AX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_5x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_5x8_64_loop
+ VZEROUPPER
+
+mulGFNI_5x8_64_end:
+ RET
+
+// func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x8(SB), $8-88
+ // Loading 6 of 40 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), AX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_5x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_5x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x8_end:
+ RET
+
+// func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x8_64Xor(SB), $8-88
+ // Loading 22 of 40 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), AX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_5x8_64Xor_loop:
+ // Load 8 outputs
+ VMOVDQU64 (R9), Z22
+ VMOVDQU64 (R10), Z23
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R8), Z29
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ VMOVDQU64 Z22, (R9)
+ ADDQ $0x40, R9
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R8)
+ ADDQ $0x40, R8
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_5x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x8Xor(SB), $8-88
+ // Loading 6 of 40 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), AX
+ MOVQ out_base+48(FP), R8
+ MOVQ out_base+48(FP), R8
+ MOVQ (R8), R9
+ MOVQ 24(R8), R10
+ MOVQ 48(R8), R11
+ MOVQ 72(R8), R12
+ MOVQ 96(R8), R13
+ MOVQ 120(R8), R14
+ MOVQ 144(R8), R15
+ MOVQ 168(R8), R8
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R8
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_5x8Xor_loop:
+ // Load 8 outputs
+ VMOVDQU (R9), Y6
+ VMOVDQU (R10), Y7
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R8), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R8)
+ ADDQ $0x20, R8
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_5x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x8Xor_end:
+ RET
+
+// func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x9_64(SB), $0-88
+ // Loading 21 of 45 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 56 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+
+mulGFNI_5x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R9), R11
+ VMOVDQU64 Z21, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU64 Z22, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU64 Z23, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU64 Z24, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU64 Z25, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU64 Z26, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU64 Z27, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU64 Z28, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU64 Z29, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R10
+ DECQ AX
+ JNZ mulGFNI_5x9_64_loop
+ VZEROUPPER
+
+mulGFNI_5x9_64_end:
+ RET
+
+// func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x9(SB), $0-88
+ // Loading 5 of 45 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 56 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+
+mulAvxGFNI_5x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R9), R11
+ VMOVDQU Y5, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU Y6, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU Y7, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU Y8, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU Y9, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU Y10, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU Y11, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU Y12, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU Y13, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R10
+ DECQ AX
+ JNZ mulAvxGFNI_5x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x9_end:
+ RET
+
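+// The Xor variants below differ from the plain kernels only in that they
+// first load the current contents of each output block and XOR the
+// accumulated products into it, instead of overwriting the destination.
+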
+// func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x9_64Xor(SB), $0-88
+ // Loading 21 of 45 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 56 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+
+mulGFNI_5x9_64Xor_loop:
+ // Load 9 outputs
+ MOVQ (R9), R11
+ VMOVDQU64 (R11)(R10*1), Z21
+ MOVQ 24(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z22
+ MOVQ 48(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z23
+ MOVQ 72(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z24
+ MOVQ 96(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z25
+ MOVQ 120(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z26
+ MOVQ 144(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z27
+ MOVQ 168(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z28
+ MOVQ 192(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z29
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R9), R11
+ VMOVDQU64 Z21, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU64 Z22, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU64 Z23, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU64 Z24, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU64 Z25, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU64 Z26, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU64 Z27, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU64 Z28, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU64 Z29, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R10
+ DECQ AX
+ JNZ mulGFNI_5x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x9_64Xor_end:
+ RET
+
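+// In the YMM kernels only the first few coefficient tables stay resident in
+// registers (Y0-Y4 here); every remaining coefficient is re-broadcast from
+// the matrix slice with VBROADCASTSD immediately before its VGF2P8AFFINEQB,
+// trading extra loads for the smaller 16-register YMM file.
+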
+// func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x9Xor(SB), $0-88
+ // Loading 5 of 45 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 56 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+
+mulAvxGFNI_5x9Xor_loop:
+ // Load 9 outputs
+ MOVQ (R9), R11
+ VMOVDQU (R11)(R10*1), Y5
+ MOVQ 24(R9), R11
+ VMOVDQU (R11)(R10*1), Y6
+ MOVQ 48(R9), R11
+ VMOVDQU (R11)(R10*1), Y7
+ MOVQ 72(R9), R11
+ VMOVDQU (R11)(R10*1), Y8
+ MOVQ 96(R9), R11
+ VMOVDQU (R11)(R10*1), Y9
+ MOVQ 120(R9), R11
+ VMOVDQU (R11)(R10*1), Y10
+ MOVQ 144(R9), R11
+ VMOVDQU (R11)(R10*1), Y11
+ MOVQ 168(R9), R11
+ VMOVDQU (R11)(R10*1), Y12
+ MOVQ 192(R9), R11
+ VMOVDQU (R11)(R10*1), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R9), R11
+ VMOVDQU Y5, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU Y6, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU Y7, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU Y8, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU Y9, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU Y10, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU Y11, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU Y12, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU Y13, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R10
+ DECQ AX
+ JNZ mulAvxGFNI_5x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x9Xor_end:
+ RET
+
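+// With 10 outputs only 20 of the 50 coefficient tables fit in Z0-Z19, since
+// Z20-Z29 hold the accumulators and Z30/Z31 the input block and product; the
+// remaining tables are read from the matrix slice inside the loop via the
+// broadcast form VGF2P8AFFINEQB.BCST. There are also too few general-purpose
+// registers to pin 10 destination pointers ("Destination kept on stack"), so
+// each pointer is reloaded from the out slice at store time (MOVQ 24*j(R9),
+// R11, 24 being the size of a []byte header).
+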
+// func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x10_64(SB), $0-88
+ // Loading 20 of 50 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 62 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+
+mulGFNI_5x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R9), R11
+ VMOVDQU64 Z20, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU64 Z21, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU64 Z22, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU64 Z23, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU64 Z24, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU64 Z25, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU64 Z26, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU64 Z27, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU64 Z28, (R11)(R10*1)
+ MOVQ 216(R9), R11
+ VMOVDQU64 Z29, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R10
+ DECQ AX
+ JNZ mulGFNI_5x10_64_loop
+ VZEROUPPER
+
+mulGFNI_5x10_64_end:
+ RET
+
+// func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x10(SB), $0-88
+ // Loading 4 of 50 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 62 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+
+mulAvxGFNI_5x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R9), R11
+ VMOVDQU Y4, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU Y5, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU Y6, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU Y7, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU Y8, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU Y9, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU Y10, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU Y11, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU Y12, (R11)(R10*1)
+ MOVQ 216(R9), R11
+ VMOVDQU Y13, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R10
+ DECQ AX
+ JNZ mulAvxGFNI_5x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x10_end:
+ RET
+
+// func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x10_64Xor(SB), $0-88
+ // Loading 20 of 50 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 62 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_5x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+
+mulGFNI_5x10_64Xor_loop:
+ // Load 10 outputs
+ MOVQ (R9), R11
+ VMOVDQU64 (R11)(R10*1), Z20
+ MOVQ 24(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z21
+ MOVQ 48(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z22
+ MOVQ 72(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z23
+ MOVQ 96(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z24
+ MOVQ 120(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z25
+ MOVQ 144(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z26
+ MOVQ 168(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z27
+ MOVQ 192(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z28
+ MOVQ 216(R9), R11
+ VMOVDQU64 (R11)(R10*1), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R9), R11
+ VMOVDQU64 Z20, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU64 Z21, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU64 Z22, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU64 Z23, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU64 Z24, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU64 Z25, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU64 Z26, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU64 Z27, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU64 Z28, (R11)(R10*1)
+ MOVQ 216(R9), R11
+ VMOVDQU64 Z29, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R10
+ DECQ AX
+ JNZ mulGFNI_5x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_5x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x10Xor(SB), $0-88
+ // Loading 4 of 50 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 62 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_5x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), DX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to input
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, DX
+
+mulAvxGFNI_5x10Xor_loop:
+ // Load 10 outputs
+ MOVQ (R9), R11
+ VMOVDQU (R11)(R10*1), Y4
+ MOVQ 24(R9), R11
+ VMOVDQU (R11)(R10*1), Y5
+ MOVQ 48(R9), R11
+ VMOVDQU (R11)(R10*1), Y6
+ MOVQ 72(R9), R11
+ VMOVDQU (R11)(R10*1), Y7
+ MOVQ 96(R9), R11
+ VMOVDQU (R11)(R10*1), Y8
+ MOVQ 120(R9), R11
+ VMOVDQU (R11)(R10*1), Y9
+ MOVQ 144(R9), R11
+ VMOVDQU (R11)(R10*1), Y10
+ MOVQ 168(R9), R11
+ VMOVDQU (R11)(R10*1), Y11
+ MOVQ 192(R9), R11
+ VMOVDQU (R11)(R10*1), Y12
+ MOVQ 216(R9), R11
+ VMOVDQU (R11)(R10*1), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R9), R11
+ VMOVDQU Y4, (R11)(R10*1)
+ MOVQ 24(R9), R11
+ VMOVDQU Y5, (R11)(R10*1)
+ MOVQ 48(R9), R11
+ VMOVDQU Y6, (R11)(R10*1)
+ MOVQ 72(R9), R11
+ VMOVDQU Y7, (R11)(R10*1)
+ MOVQ 96(R9), R11
+ VMOVDQU Y8, (R11)(R10*1)
+ MOVQ 120(R9), R11
+ VMOVDQU Y9, (R11)(R10*1)
+ MOVQ 144(R9), R11
+ VMOVDQU Y10, (R11)(R10*1)
+ MOVQ 168(R9), R11
+ VMOVDQU Y11, (R11)(R10*1)
+ MOVQ 192(R9), R11
+ VMOVDQU Y12, (R11)(R10*1)
+ MOVQ 216(R9), R11
+ VMOVDQU Y13, (R11)(R10*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R10
+ DECQ AX
+ JNZ mulAvxGFNI_5x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_5x10Xor_end:
+ RET
+
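+// The 6-input kernels that follow (6x1 through 6x3 here) have few enough
+// outputs that every coefficient table and every destination pointer fits in
+// registers ("Destination kept in GP registers"): the output pointers get the
+// start offset added once and are then advanced directly after each store,
+// and because all tables are pre-broadcast, CX is free to be reused for the
+// sixth input pointer.
+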
+// func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 9 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, CX
+
+mulGFNI_6x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z7
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z7, Z6
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z7
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z7
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z7
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z7
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (CX), Z7
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z5, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Store 1 outputs
+ VMOVDQU64 Z6, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x1_64_loop
+ VZEROUPPER
+
+mulGFNI_6x1_64_end:
+ RET
+
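+// Each mulGFNI_*_64 kernel has an AVX-only companion like the one below: the
+// _64 form requires AVX512F/AVX512DQ, broadcasts tables with VBROADCASTF32X2
+// and processes 64 bytes per iteration (SHRQ $0x06, AX), while the plain
+// mulAvxGFNI form needs only AVX and GFNI, uses VBROADCASTSD and works on
+// 32-byte blocks (SHRQ $0x05, AX). Both end with VZEROUPPER to avoid AVX/SSE
+// transition penalties on return.
+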
+// func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 9 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, CX
+
+mulAvxGFNI_6x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y7, Y6
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y7
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (CX), Y7
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y5, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Store 1 outputs
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x1_end:
+ RET
+
+// func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 9 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, CX
+
+mulGFNI_6x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (R9), Z6
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z7
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z7
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z7
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z7
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z7
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (CX), Z7
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z5, Z7, Z7
+ VXORPD Z6, Z7, Z6
+
+ // Store 1 outputs
+ VMOVDQU64 Z6, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 9 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R9
+ MOVQ start+72(FP), R10
+
+ // Add start offset to output
+ ADDQ R10, R9
+
+ // Add start offset to input
+ ADDQ R10, DX
+ ADDQ R10, BX
+ ADDQ R10, SI
+ ADDQ R10, DI
+ ADDQ R10, R8
+ ADDQ R10, CX
+
+mulAvxGFNI_6x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R9), Y6
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y7
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y7
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y7
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y7
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y7
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (CX), Y7
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y5, Y7, Y7
+ VXORPD Y6, Y7, Y6
+
+ // Store 1 outputs
+ VMOVDQU Y6, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x1Xor_end:
+ RET
+
+// func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 16 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R9
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+ ADDQ R11, R9
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, CX
+
+mulGFNI_6x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z14
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z14, Z12
+ VGF2P8AFFINEQB $0x00, Z1, Z14, Z13
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z14
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z3, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z14
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z5, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z14
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z7, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z14
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z9, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (CX), Z14
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z10, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z11, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Store 2 outputs
+ VMOVDQU64 Z12, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z13, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x2_64_loop
+ VZEROUPPER
+
+mulGFNI_6x2_64_end:
+ RET
+
+// func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x2(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 16 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R9
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+ ADDQ R11, R9
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, CX
+
+mulAvxGFNI_6x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (CX), Y14
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x2_end:
+ RET
+
+// func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 16 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R9
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+ ADDQ R11, R9
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, CX
+
+mulGFNI_6x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (R10), Z12
+ VMOVDQU64 (R9), Z13
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z14
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z1, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z14
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z3, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z14
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z5, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z14
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z7, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z14
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z9, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (CX), Z14
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z10, Z14, Z15
+ VXORPD Z12, Z15, Z12
+ VGF2P8AFFINEQB $0x00, Z11, Z14, Z15
+ VXORPD Z13, Z15, Z13
+
+ // Store 2 outputs
+ VMOVDQU64 Z12, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z13, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x2Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 16 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R9
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+ ADDQ R11, R9
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, CX
+
+mulAvxGFNI_6x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R10), Y12
+ VMOVDQU (R9), Y13
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (CX), Y14
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x2Xor_end:
+ RET
+
+// func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R9
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R9
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, CX
+
+mulGFNI_6x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z21
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z1, Z21, Z19
+ VGF2P8AFFINEQB $0x00, Z2, Z21, Z20
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z21
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z4, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z5, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z21
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z7, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z8, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z21
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z10, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z11, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z21
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z13, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z14, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (CX), Z21
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z15, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z16, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z17, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Store 3 outputs
+ VMOVDQU64 Z18, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z19, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z20, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x3_64_loop
+ VZEROUPPER
+
+mulGFNI_6x3_64_end:
+ RET
+
+// func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x3(SB), $0-88
+ // Loading 11 of 18 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R10
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R10
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, DX
+
+mulAvxGFNI_6x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x3_end:
+ RET
+
+// func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x3_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R9
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R10
+ ADDQ R12, R11
+ ADDQ R12, R9
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, CX
+
+mulGFNI_6x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (R10), Z18
+ VMOVDQU64 (R11), Z19
+ VMOVDQU64 (R9), Z20
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z21
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z1, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z2, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z21
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z4, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z5, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z21
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z7, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z8, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z21
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z10, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z11, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z21
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z13, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z14, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (CX), Z21
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z15, Z21, Z22
+ VXORPD Z18, Z22, Z18
+ VGF2P8AFFINEQB $0x00, Z16, Z21, Z22
+ VXORPD Z19, Z22, Z19
+ VGF2P8AFFINEQB $0x00, Z17, Z21, Z22
+ VXORPD Z20, Z22, Z20
+
+ // Store 3 outputs
+ VMOVDQU64 Z18, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z19, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z20, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x3Xor(SB), $0-88
+ // Loading 11 of 18 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 23 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R10
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R10
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, DX
+
+mulAvxGFNI_6x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R11), Y11
+ VMOVDQU (R12), Y12
+ VMOVDQU (R10), Y13
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x3Xor_end:
+ RET
+
+// func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x4_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R9
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R9
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, CX
+
+mulGFNI_6x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z28
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z28, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z28, Z27
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z28
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (SI), Z28
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (DI), Z28
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R8), Z28
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z16, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z17, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z18, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z19, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (CX), Z28
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z20, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z21, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z22, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z23, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Store 4 outputs
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x4_64_loop
+ VZEROUPPER
+
+mulGFNI_6x4_64_end:
+ RET
+
+// func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x4(SB), $0-88
+ // Loading 10 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R10
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R10
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, DX
+
+mulAvxGFNI_6x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x4_end:
+ RET
+
+// func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x4_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), CX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R9
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R9
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, CX
+
+mulGFNI_6x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (R10), Z24
+ VMOVDQU64 (R11), Z25
+ VMOVDQU64 (R12), Z26
+ VMOVDQU64 (R9), Z27
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z28
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z28
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (SI), Z28
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (DI), Z28
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R8), Z28
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z16, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z17, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z18, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z19, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (CX), Z28
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z20, Z28, Z29
+ VXORPD Z24, Z29, Z24
+ VGF2P8AFFINEQB $0x00, Z21, Z28, Z29
+ VXORPD Z25, Z29, Z25
+ VGF2P8AFFINEQB $0x00, Z22, Z28, Z29
+ VXORPD Z26, Z29, Z26
+ VGF2P8AFFINEQB $0x00, Z23, Z28, Z29
+ VXORPD Z27, Z29, Z27
+
+ // Store 4 outputs
+ VMOVDQU64 Z24, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x4Xor(SB), $0-88
+ // Loading 10 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 30 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R10
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R10
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, DX
+
+mulAvxGFNI_6x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (R11), Y10
+ VMOVDQU (R12), Y11
+ VMOVDQU (R13), Y12
+ VMOVDQU (R10), Y13
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x4Xor_end:
+ RET
+
+// func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x5_64(SB), $0-88
+ // Loading 25 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 37 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R10
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R10
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, DX
+
+mulGFNI_6x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x5_64_loop
+ VZEROUPPER
+
+mulGFNI_6x5_64_end:
+ RET
+
+// func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x5(SB), $0-88
+ // Loading 9 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 37 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R10
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R10
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, DX
+
+mulAvxGFNI_6x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x5_end:
+ RET
+
+// func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x5_64Xor(SB), $0-88
+ // Loading 25 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 37 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R10
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R10
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, DX
+
+mulGFNI_6x5_64Xor_loop:
+ // Load 5 outputs
+ VMOVDQU64 (R11), Z25
+ VMOVDQU64 (R12), Z26
+ VMOVDQU64 (R13), Z27
+ VMOVDQU64 (R14), Z28
+ VMOVDQU64 (R10), Z29
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ VMOVDQU64 Z25, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x5Xor(SB), $0-88
+ // Loading 9 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 37 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R10
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R10
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, DX
+
+mulAvxGFNI_6x5Xor_loop:
+ // Load 5 outputs
+ VMOVDQU (R11), Y9
+ VMOVDQU (R12), Y10
+ VMOVDQU (R13), Y11
+ VMOVDQU (R14), Y12
+ VMOVDQU (R10), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x5Xor_end:
+ RET
+
+// func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x6_64(SB), $8-88
+ // Loading 24 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 44 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, DX
+
+mulGFNI_6x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x6_64_loop
+ VZEROUPPER
+
+mulGFNI_6x6_64_end:
+ RET
+
+// func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x6(SB), $8-88
+ // Loading 8 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 44 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, DX
+
+mulAvxGFNI_6x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x6_end:
+ RET
+
+// func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x6_64Xor(SB), $8-88
+ // Loading 24 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 44 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, DX
+
+mulGFNI_6x6_64Xor_loop:
+ // Load 6 outputs
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R10), Z29
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_6x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x6Xor(SB), $8-88
+ // Loading 8 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 44 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, DX
+
+mulAvxGFNI_6x6Xor_loop:
+ // Load 6 outputs
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R10), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_6x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x6Xor_end:
+ RET
+
+// func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x7_64(SB), $8-88
+ // Loading 23 of 42 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 51 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), AX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_6x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_6x7_64_loop
+ VZEROUPPER
+
+mulGFNI_6x7_64_end:
+ RET
+
+// func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x7(SB), $8-88
+ // Loading 7 of 42 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 51 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), AX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_6x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_6x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x7_end:
+ RET
+
+// func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x7_64Xor(SB), $8-88
+ // Loading 23 of 42 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 51 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), AX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_6x7_64Xor_loop:
+ // Load 7 outputs
+ VMOVDQU64 (R10), Z23
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R9), Z29
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R9)
+ ADDQ $0x40, R9
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_6x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x7Xor(SB), $8-88
+ // Loading 7 of 42 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 51 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), AX
+ MOVQ out_base+48(FP), R9
+ MOVQ out_base+48(FP), R9
+ MOVQ (R9), R10
+ MOVQ 24(R9), R11
+ MOVQ 48(R9), R12
+ MOVQ 72(R9), R13
+ MOVQ 96(R9), R14
+ MOVQ 120(R9), R15
+ MOVQ 144(R9), R9
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R9
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_6x7Xor_loop:
+ // Load 7 outputs
+ VMOVDQU (R10), Y7
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R9), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R9)
+ ADDQ $0x20, R9
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_6x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x7Xor_end:
+ RET
+
+// func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x8_64(SB), $0-88
+ // Loading 22 of 48 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 58 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulGFNI_6x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 Z22, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU64 Z23, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU64 Z24, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU64 Z25, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU64 Z26, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU64 Z27, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU64 Z28, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU64 Z29, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R11
+ DECQ AX
+ JNZ mulGFNI_6x8_64_loop
+ VZEROUPPER
+
+mulGFNI_6x8_64_end:
+ RET
+
+// func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x8(SB), $0-88
+ // Loading 6 of 48 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 58 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulAvxGFNI_6x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y8, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y9, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y10, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y11, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y12, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y13, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxGFNI_6x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x8_end:
+ RET
+
+// func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x8_64Xor(SB), $0-88
+ // Loading 22 of 48 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 58 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulGFNI_6x8_64Xor_loop:
+ // Load 8 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 (R12)(R11*1), Z22
+ MOVQ 24(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z23
+ MOVQ 48(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z24
+ MOVQ 72(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z25
+ MOVQ 96(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z26
+ MOVQ 120(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z27
+ MOVQ 144(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z28
+ MOVQ 168(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z29
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 Z22, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU64 Z23, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU64 Z24, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU64 Z25, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU64 Z26, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU64 Z27, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU64 Z28, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU64 Z29, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R11
+ DECQ AX
+ JNZ mulGFNI_6x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x8Xor(SB), $0-88
+ // Loading 6 of 48 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 58 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulAvxGFNI_6x8Xor_loop:
+ // Load 8 outputs
+ MOVQ (R10), R12
+ VMOVDQU (R12)(R11*1), Y6
+ MOVQ 24(R10), R12
+ VMOVDQU (R12)(R11*1), Y7
+ MOVQ 48(R10), R12
+ VMOVDQU (R12)(R11*1), Y8
+ MOVQ 72(R10), R12
+ VMOVDQU (R12)(R11*1), Y9
+ MOVQ 96(R10), R12
+ VMOVDQU (R12)(R11*1), Y10
+ MOVQ 120(R10), R12
+ VMOVDQU (R12)(R11*1), Y11
+ MOVQ 144(R10), R12
+ VMOVDQU (R12)(R11*1), Y12
+ MOVQ 168(R10), R12
+ VMOVDQU (R12)(R11*1), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y8, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y9, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y10, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y11, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y12, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y13, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxGFNI_6x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x8Xor_end:
+ RET
+
+// func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x9_64(SB), $0-88
+ // Loading 21 of 54 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 65 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulGFNI_6x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 Z21, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU64 Z22, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU64 Z23, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU64 Z24, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU64 Z25, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU64 Z26, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU64 Z27, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU64 Z28, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU64 Z29, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R11
+ DECQ AX
+ JNZ mulGFNI_6x9_64_loop
+ VZEROUPPER
+
+mulGFNI_6x9_64_end:
+ RET
+
+// func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x9(SB), $0-88
+ // Loading 5 of 54 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 65 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulAvxGFNI_6x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y5, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y8, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y9, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y10, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y11, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y12, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU Y13, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxGFNI_6x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x9_end:
+ RET
+
+// func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x9_64Xor(SB), $0-88
+ // Loading 21 of 54 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 65 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulGFNI_6x9_64Xor_loop:
+ // Load 9 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 (R12)(R11*1), Z21
+ MOVQ 24(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z22
+ MOVQ 48(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z23
+ MOVQ 72(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z24
+ MOVQ 96(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z25
+ MOVQ 120(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z26
+ MOVQ 144(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z27
+ MOVQ 168(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z28
+ MOVQ 192(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z29
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 Z21, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU64 Z22, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU64 Z23, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU64 Z24, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU64 Z25, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU64 Z26, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU64 Z27, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU64 Z28, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU64 Z29, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R11
+ DECQ AX
+ JNZ mulGFNI_6x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x9Xor(SB), $0-88
+ // Loading 5 of 54 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 65 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulAvxGFNI_6x9Xor_loop:
+ // Load 9 outputs
+ MOVQ (R10), R12
+ VMOVDQU (R12)(R11*1), Y5
+ MOVQ 24(R10), R12
+ VMOVDQU (R12)(R11*1), Y6
+ MOVQ 48(R10), R12
+ VMOVDQU (R12)(R11*1), Y7
+ MOVQ 72(R10), R12
+ VMOVDQU (R12)(R11*1), Y8
+ MOVQ 96(R10), R12
+ VMOVDQU (R12)(R11*1), Y9
+ MOVQ 120(R10), R12
+ VMOVDQU (R12)(R11*1), Y10
+ MOVQ 144(R10), R12
+ VMOVDQU (R12)(R11*1), Y11
+ MOVQ 168(R10), R12
+ VMOVDQU (R12)(R11*1), Y12
+ MOVQ 192(R10), R12
+ VMOVDQU (R12)(R11*1), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y5, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y8, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y9, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y10, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y11, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y12, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU Y13, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxGFNI_6x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x9Xor_end:
+ RET
+
+// func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x10_64(SB), $0-88
+ // Loading 20 of 60 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 72 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulGFNI_6x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 Z20, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU64 Z21, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU64 Z22, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU64 Z23, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU64 Z24, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU64 Z25, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU64 Z26, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU64 Z27, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU64 Z28, (R12)(R11*1)
+ MOVQ 216(R10), R12
+ VMOVDQU64 Z29, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R11
+ DECQ AX
+ JNZ mulGFNI_6x10_64_loop
+ VZEROUPPER
+
+mulGFNI_6x10_64_end:
+ RET
+
+// func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x10(SB), $0-88
+ // Loading 4 of 60 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 72 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulAvxGFNI_6x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y4, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y5, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y8, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y9, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y10, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y11, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU Y12, (R12)(R11*1)
+ MOVQ 216(R10), R12
+ VMOVDQU Y13, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxGFNI_6x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x10_end:
+ RET
+
+// func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_6x10_64Xor(SB), $0-88
+ // Loading 20 of 60 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 72 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_6x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulGFNI_6x10_64Xor_loop:
+ // Load 10 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 (R12)(R11*1), Z20
+ MOVQ 24(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z21
+ MOVQ 48(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z22
+ MOVQ 72(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z23
+ MOVQ 96(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z24
+ MOVQ 120(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z25
+ MOVQ 144(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z26
+ MOVQ 168(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z27
+ MOVQ 192(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z28
+ MOVQ 216(R10), R12
+ VMOVDQU64 (R12)(R11*1), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R10), R12
+ VMOVDQU64 Z20, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU64 Z21, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU64 Z22, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU64 Z23, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU64 Z24, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU64 Z25, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU64 Z26, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU64 Z27, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU64 Z28, (R12)(R11*1)
+ MOVQ 216(R10), R12
+ VMOVDQU64 Z29, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R11
+ DECQ AX
+ JNZ mulGFNI_6x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_6x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_6x10Xor(SB), $0-88
+ // Loading 4 of 60 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 72 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_6x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), DX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to input
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, DX
+
+mulAvxGFNI_6x10Xor_loop:
+ // Load 10 outputs
+ MOVQ (R10), R12
+ VMOVDQU (R12)(R11*1), Y4
+ MOVQ 24(R10), R12
+ VMOVDQU (R12)(R11*1), Y5
+ MOVQ 48(R10), R12
+ VMOVDQU (R12)(R11*1), Y6
+ MOVQ 72(R10), R12
+ VMOVDQU (R12)(R11*1), Y7
+ MOVQ 96(R10), R12
+ VMOVDQU (R12)(R11*1), Y8
+ MOVQ 120(R10), R12
+ VMOVDQU (R12)(R11*1), Y9
+ MOVQ 144(R10), R12
+ VMOVDQU (R12)(R11*1), Y10
+ MOVQ 168(R10), R12
+ VMOVDQU (R12)(R11*1), Y11
+ MOVQ 192(R10), R12
+ VMOVDQU (R12)(R11*1), Y12
+ MOVQ 216(R10), R12
+ VMOVDQU (R12)(R11*1), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R10), R12
+ VMOVDQU Y4, (R12)(R11*1)
+ MOVQ 24(R10), R12
+ VMOVDQU Y5, (R12)(R11*1)
+ MOVQ 48(R10), R12
+ VMOVDQU Y6, (R12)(R11*1)
+ MOVQ 72(R10), R12
+ VMOVDQU Y7, (R12)(R11*1)
+ MOVQ 96(R10), R12
+ VMOVDQU Y8, (R12)(R11*1)
+ MOVQ 120(R10), R12
+ VMOVDQU Y9, (R12)(R11*1)
+ MOVQ 144(R10), R12
+ VMOVDQU Y10, (R12)(R11*1)
+ MOVQ 168(R10), R12
+ VMOVDQU Y11, (R12)(R11*1)
+ MOVQ 192(R10), R12
+ VMOVDQU Y12, (R12)(R11*1)
+ MOVQ 216(R10), R12
+ VMOVDQU Y13, (R12)(R11*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R11
+ DECQ AX
+ JNZ mulAvxGFNI_6x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_6x10Xor_end:
+ RET
+
+// func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), CX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, CX
+
+mulGFNI_7x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z8
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z8, Z7
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z8
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z8
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z8
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z8
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (R9), Z8
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z5, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU64 (CX), Z8
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z6, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Store 1 outputs
+ VMOVDQU64 Z7, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x1_64_loop
+ VZEROUPPER
+
+mulGFNI_7x1_64_end:
+ RET
+
+// func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), CX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, CX
+
+mulAvxGFNI_7x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y8, Y7
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y8
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (R9), Y8
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y5, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 6 to 1 outputs
+ VMOVDQU (CX), Y8
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y6, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Store 1 outputs
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x1_end:
+ RET
+
+// func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), CX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, CX
+
+mulGFNI_7x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (R10), Z7
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z8
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z8
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z8
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z8
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z8
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (R9), Z8
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z5, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU64 (CX), Z8
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z6, Z8, Z8
+ VXORPD Z7, Z8, Z7
+
+ // Store 1 outputs
+ VMOVDQU64 Z7, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 10 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), CX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R10
+ MOVQ start+72(FP), R11
+
+ // Add start offset to output
+ ADDQ R11, R10
+
+ // Add start offset to input
+ ADDQ R11, DX
+ ADDQ R11, BX
+ ADDQ R11, SI
+ ADDQ R11, DI
+ ADDQ R11, R8
+ ADDQ R11, R9
+ ADDQ R11, CX
+
+mulAvxGFNI_7x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R10), Y7
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y8
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y8
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y8
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y8
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y8
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (R9), Y8
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y5, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Load and process 32 bytes from input 6 to 1 outputs
+ VMOVDQU (CX), Y8
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y6, Y8, Y8
+ VXORPD Y7, Y8, Y7
+
+ // Store 1 outputs
+ VMOVDQU Y7, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x1Xor_end:
+ RET
+
+// func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), CX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R10
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R11
+ ADDQ R12, R10
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, CX
+
+mulGFNI_7x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z16
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z16, Z14
+ VGF2P8AFFINEQB $0x00, Z1, Z16, Z15
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z16
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z16
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z16
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z16
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (R9), Z16
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU64 (CX), Z16
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z12, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z13, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Store 2 outputs
+ VMOVDQU64 Z14, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z15, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x2_64_loop
+ VZEROUPPER
+
+mulGFNI_7x2_64_end:
+ RET
+
+// func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x2(SB), $0-88
+ // Loading 12 of 14 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R11
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+ ADDQ R13, R11
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, DX
+
+mulAvxGFNI_7x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x2_end:
+ RET
+
+// func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), CX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R10
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R11
+ ADDQ R12, R10
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, CX
+
+mulGFNI_7x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (R11), Z14
+ VMOVDQU64 (R10), Z15
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z16
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z1, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z16
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z16
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z16
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z16
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (R9), Z16
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU64 (CX), Z16
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z12, Z16, Z17
+ VXORPD Z14, Z17, Z14
+ VGF2P8AFFINEQB $0x00, Z13, Z16, Z17
+ VXORPD Z15, Z17, Z15
+
+ // Store 2 outputs
+ VMOVDQU64 Z14, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z15, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x2Xor(SB), $0-88
+ // Loading 12 of 14 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 18 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R11
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+ ADDQ R13, R11
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, DX
+
+mulAvxGFNI_7x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R12), Y12
+ VMOVDQU (R11), Y13
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x2Xor_end:
+ RET
+
+// func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), CX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R10
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R10
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, CX
+
+mulGFNI_7x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z24
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z24, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z24, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z24, Z23
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z24
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z24
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z24
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z24
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (R9), Z24
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU64 (CX), Z24
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Store 3 outputs
+ VMOVDQU64 Z21, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z22, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x3_64_loop
+ VZEROUPPER
+
+mulGFNI_7x3_64_end:
+ RET
+
+// func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x3(SB), $0-88
+ // Loading 11 of 21 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R11
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R11
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, DX
+
+mulAvxGFNI_7x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x3_end:
+ RET
+
+// func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x3_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), CX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R10
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R11
+ ADDQ R13, R12
+ ADDQ R13, R10
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, CX
+
+mulGFNI_7x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (R11), Z21
+ VMOVDQU64 (R12), Z22
+ VMOVDQU64 (R10), Z23
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z24
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z24
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z24
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z24
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z24
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (R9), Z24
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU64 (CX), Z24
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
+ VXORPD Z21, Z25, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
+ VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z24, Z25
+ VXORPD Z23, Z25, Z23
+
+ // Store 3 outputs
+ VMOVDQU64 Z21, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z22, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z23, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x3Xor(SB), $0-88
+ // Loading 11 of 21 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 26 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R11
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R11
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, DX
+
+mulAvxGFNI_7x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R12), Y11
+ VMOVDQU (R13), Y12
+ VMOVDQU (R11), Y13
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x3Xor_end:
+ RET
+
+// func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x4_64(SB), $0-88
+ // Loading 26 of 28 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R11
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R11
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, DX
+
+mulGFNI_7x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 4 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 4 outputs
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x4_64_loop
+ VZEROUPPER
+
+mulGFNI_7x4_64_end:
+ RET
+
+// func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x4(SB), $0-88
+ // Loading 10 of 28 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R11
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R11
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, DX
+
+mulAvxGFNI_7x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x4_end:
+ RET
+
+// func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x4_64Xor(SB), $0-88
+ // Loading 26 of 28 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R11
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R11
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, DX
+
+mulGFNI_7x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (R12), Z26
+ VMOVDQU64 (R13), Z27
+ VMOVDQU64 (R14), Z28
+ VMOVDQU64 (R11), Z29
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 4 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 4 outputs
+ VMOVDQU64 Z26, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x4Xor(SB), $0-88
+ // Loading 10 of 28 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 34 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R11
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R11
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, DX
+
+mulAvxGFNI_7x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (R12), Y10
+ VMOVDQU (R13), Y11
+ VMOVDQU (R14), Y12
+ VMOVDQU (R11), Y13
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x4Xor_end:
+ RET
+
+// func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x5_64(SB), $8-88
+ // Loading 25 of 35 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, DX
+
+mulGFNI_7x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x5_64_loop
+ VZEROUPPER
+
+mulGFNI_7x5_64_end:
+ RET
+
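+// mulAvxGFNI_7x5 is the 256-bit (YMM) counterpart of the 512-bit kernel above,
+// processing 32 bytes per input per iteration. Each VGF2P8AFFINEQB applies one
+// 8x8 GF(2) matrix from the matrix slice (broadcast with VBROADCASTSD) to every
+// byte of the input vector, and VXORPD accumulates the per-input products into
+// the five output vectors. Only 9 of the 35 matrices stay resident in Y0-Y8; the
+// remainder are re-broadcast from memory inside the loop.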
+// func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x5(SB), $8-88
+ // Loading 9 of 35 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, DX
+
+mulAvxGFNI_7x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x5_end:
+ RET
+
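+// The Xor variants are identical to the kernels above except that each loop
+// iteration first loads the current contents of the output slices and XORs the
+// new products into them (out ^= matrix * in) instead of overwriting, so outputs
+// that already hold partial results are accumulated into rather than replaced.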
+// func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x5_64Xor(SB), $8-88
+ // Loading 25 of 35 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, DX
+
+mulGFNI_7x5_64Xor_loop:
+ // Load 5 outputs
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R11), Z29
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_7x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x5Xor(SB), $8-88
+ // Loading 9 of 35 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, DX
+
+mulAvxGFNI_7x5Xor_loop:
+ // Load 5 outputs
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R11), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_7x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x5Xor_end:
+ RET
+
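+// In the 7x6 kernels the general-purpose registers run out: AX is reused for the
+// seventh input pointer, so the loop count is recomputed into BP once the start
+// offsets have been applied ("Reload length to save a register" below).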
+// func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x6_64(SB), $8-88
+ // Loading 24 of 42 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), AX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_7x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 6 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_7x6_64_loop
+ VZEROUPPER
+
+mulGFNI_7x6_64_end:
+ RET
+
+// func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x6(SB), $8-88
+ // Loading 8 of 42 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), AX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_7x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_7x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x6_end:
+ RET
+
+// func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x6_64Xor(SB), $8-88
+ // Loading 24 of 42 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), AX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_7x6_64Xor_loop:
+ // Load 6 outputs
+ VMOVDQU64 (R11), Z24
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R10), Z29
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 6 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ VMOVDQU64 Z24, (R11)
+ ADDQ $0x40, R11
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R10)
+ ADDQ $0x40, R10
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_7x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x6Xor(SB), $8-88
+ // Loading 8 of 42 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 50 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), AX
+ MOVQ out_base+48(FP), R10
+ MOVQ out_base+48(FP), R10
+ MOVQ (R10), R11
+ MOVQ 24(R10), R12
+ MOVQ 48(R10), R13
+ MOVQ 72(R10), R14
+ MOVQ 96(R10), R15
+ MOVQ 120(R10), R10
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R10
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_7x6Xor_loop:
+ // Load 6 outputs
+ VMOVDQU (R11), Y8
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R10), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R10)
+ ADDQ $0x20, R10
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_7x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x6Xor_end:
+ RET
+
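+// With 7 outputs there are no spare registers to pin the destination pointers, so
+// the 7x7 kernels keep only the out slice base in R11 and reload each output
+// pointer at store time, addressing it as (pointer)(R12*1) with R12 holding the
+// running byte offset in place of per-output pointer increments.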
+// func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x7_64(SB), $0-88
+ // Loading 23 of 49 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 58 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulGFNI_7x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 Z23, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU64 Z24, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU64 Z25, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU64 Z26, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU64 Z27, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU64 Z28, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU64 Z29, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R12
+ DECQ AX
+ JNZ mulGFNI_7x7_64_loop
+ VZEROUPPER
+
+mulGFNI_7x7_64_end:
+ RET
+
+// func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x7(SB), $0-88
+ // Loading 7 of 49 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 58 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulAvxGFNI_7x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y10, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y11, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y12, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y13, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxGFNI_7x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x7_end:
+ RET
+
+// func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x7_64Xor(SB), $0-88
+ // Loading 23 of 49 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 58 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulGFNI_7x7_64Xor_loop:
+ // Load 7 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 (R13)(R12*1), Z23
+ MOVQ 24(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z24
+ MOVQ 48(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z25
+ MOVQ 72(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z26
+ MOVQ 96(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z27
+ MOVQ 120(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z28
+ MOVQ 144(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z29
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 Z23, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU64 Z24, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU64 Z25, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU64 Z26, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU64 Z27, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU64 Z28, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU64 Z29, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R12
+ DECQ AX
+ JNZ mulGFNI_7x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x7Xor(SB), $0-88
+ // Loading 7 of 49 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 58 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulAvxGFNI_7x7Xor_loop:
+ // Load 7 outputs
+ MOVQ (R11), R13
+ VMOVDQU (R13)(R12*1), Y7
+ MOVQ 24(R11), R13
+ VMOVDQU (R13)(R12*1), Y8
+ MOVQ 48(R11), R13
+ VMOVDQU (R13)(R12*1), Y9
+ MOVQ 72(R11), R13
+ VMOVDQU (R13)(R12*1), Y10
+ MOVQ 96(R11), R13
+ VMOVDQU (R13)(R12*1), Y11
+ MOVQ 120(R11), R13
+ VMOVDQU (R13)(R12*1), Y12
+ MOVQ 144(R11), R13
+ VMOVDQU (R13)(R12*1), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y10, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y11, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y12, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y13, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxGFNI_7x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x7Xor_end:
+ RET
+
+// func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x8_64(SB), $0-88
+ // Loading 22 of 56 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 66 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulGFNI_7x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 Z22, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU64 Z23, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU64 Z24, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU64 Z25, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU64 Z26, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU64 Z27, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU64 Z28, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU64 Z29, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R12
+ DECQ AX
+ JNZ mulGFNI_7x8_64_loop
+ VZEROUPPER
+
+mulGFNI_7x8_64_end:
+ RET
+
+// func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x8(SB), $0-88
+ // Loading 6 of 56 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 66 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulAvxGFNI_7x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y10, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y11, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y12, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y13, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxGFNI_7x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x8_end:
+ RET
+
+// func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x8_64Xor(SB), $0-88
+ // Loading 22 of 56 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 66 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulGFNI_7x8_64Xor_loop:
+ // Load 8 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 (R13)(R12*1), Z22
+ MOVQ 24(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z23
+ MOVQ 48(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z24
+ MOVQ 72(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z25
+ MOVQ 96(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z26
+ MOVQ 120(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z27
+ MOVQ 144(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z28
+ MOVQ 168(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z29
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 Z22, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU64 Z23, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU64 Z24, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU64 Z25, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU64 Z26, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU64 Z27, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU64 Z28, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU64 Z29, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R12
+ DECQ AX
+ JNZ mulGFNI_7x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x8Xor(SB), $0-88
+ // Loading 6 of 56 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 66 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulAvxGFNI_7x8Xor_loop:
+ // Load 8 outputs
+ MOVQ (R11), R13
+ VMOVDQU (R13)(R12*1), Y6
+ MOVQ 24(R11), R13
+ VMOVDQU (R13)(R12*1), Y7
+ MOVQ 48(R11), R13
+ VMOVDQU (R13)(R12*1), Y8
+ MOVQ 72(R11), R13
+ VMOVDQU (R13)(R12*1), Y9
+ MOVQ 96(R11), R13
+ VMOVDQU (R13)(R12*1), Y10
+ MOVQ 120(R11), R13
+ VMOVDQU (R13)(R12*1), Y11
+ MOVQ 144(R11), R13
+ VMOVDQU (R13)(R12*1), Y12
+ MOVQ 168(R11), R13
+ VMOVDQU (R13)(R12*1), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y10, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y11, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y12, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y13, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxGFNI_7x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x8Xor_end:
+ RET
+
+// func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x9_64(SB), $0-88
+ // Loading 21 of 63 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 74 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulGFNI_7x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 Z21, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU64 Z22, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU64 Z23, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU64 Z24, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU64 Z25, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU64 Z26, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU64 Z27, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU64 Z28, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU64 Z29, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R12
+ DECQ AX
+ JNZ mulGFNI_7x9_64_loop
+ VZEROUPPER
+
+mulGFNI_7x9_64_end:
+ RET
+
+// func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x9(SB), $0-88
+ // Loading 5 of 63 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 74 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulAvxGFNI_7x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y5, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y10, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y11, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y12, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU Y13, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxGFNI_7x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x9_end:
+ RET
+
+// func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x9_64Xor(SB), $0-88
+ // Loading 21 of 63 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 74 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulGFNI_7x9_64Xor_loop:
+ // Load 9 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 (R13)(R12*1), Z21
+ MOVQ 24(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z22
+ MOVQ 48(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z23
+ MOVQ 72(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z24
+ MOVQ 96(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z25
+ MOVQ 120(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z26
+ MOVQ 144(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z27
+ MOVQ 168(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z28
+ MOVQ 192(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z29
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 Z21, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU64 Z22, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU64 Z23, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU64 Z24, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU64 Z25, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU64 Z26, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU64 Z27, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU64 Z28, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU64 Z29, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R12
+ DECQ AX
+ JNZ mulGFNI_7x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x9Xor(SB), $0-88
+ // Loading 5 of 63 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 74 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulAvxGFNI_7x9Xor_loop:
+ // Load 9 outputs
+ MOVQ (R11), R13
+ VMOVDQU (R13)(R12*1), Y5
+ MOVQ 24(R11), R13
+ VMOVDQU (R13)(R12*1), Y6
+ MOVQ 48(R11), R13
+ VMOVDQU (R13)(R12*1), Y7
+ MOVQ 72(R11), R13
+ VMOVDQU (R13)(R12*1), Y8
+ MOVQ 96(R11), R13
+ VMOVDQU (R13)(R12*1), Y9
+ MOVQ 120(R11), R13
+ VMOVDQU (R13)(R12*1), Y10
+ MOVQ 144(R11), R13
+ VMOVDQU (R13)(R12*1), Y11
+ MOVQ 168(R11), R13
+ VMOVDQU (R13)(R12*1), Y12
+ MOVQ 192(R11), R13
+ VMOVDQU (R13)(R12*1), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y5, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y10, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y11, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y12, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU Y13, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxGFNI_7x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x9Xor_end:
+ RET
+
+// func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x10_64(SB), $0-88
+ // Loading 20 of 70 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulGFNI_7x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 Z20, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU64 Z21, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU64 Z22, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU64 Z23, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU64 Z24, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU64 Z25, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU64 Z26, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU64 Z27, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU64 Z28, (R13)(R12*1)
+ MOVQ 216(R11), R13
+ VMOVDQU64 Z29, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R12
+ DECQ AX
+ JNZ mulGFNI_7x10_64_loop
+ VZEROUPPER
+
+mulGFNI_7x10_64_end:
+ RET
+
+// func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x10(SB), $0-88
+ // Loading 4 of 70 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulAvxGFNI_7x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y4, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y5, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y10, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y11, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU Y12, (R13)(R12*1)
+ MOVQ 216(R11), R13
+ VMOVDQU Y13, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxGFNI_7x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x10_end:
+ RET
+
+// func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_7x10_64Xor(SB), $0-88
+ // Loading 20 of 70 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_7x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulGFNI_7x10_64Xor_loop:
+ // Load 10 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 (R13)(R12*1), Z20
+ MOVQ 24(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z21
+ MOVQ 48(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z22
+ MOVQ 72(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z23
+ MOVQ 96(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z24
+ MOVQ 120(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z25
+ MOVQ 144(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z26
+ MOVQ 168(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z27
+ MOVQ 192(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z28
+ MOVQ 216(R11), R13
+ VMOVDQU64 (R13)(R12*1), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R11), R13
+ VMOVDQU64 Z20, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU64 Z21, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU64 Z22, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU64 Z23, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU64 Z24, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU64 Z25, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU64 Z26, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU64 Z27, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU64 Z28, (R13)(R12*1)
+ MOVQ 216(R11), R13
+ VMOVDQU64 Z29, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R12
+ DECQ AX
+ JNZ mulGFNI_7x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_7x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_7x10Xor(SB), $0-88
+ // Loading 4 of 70 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_7x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), DX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to input
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, DX
+
+mulAvxGFNI_7x10Xor_loop:
+ // Load 10 outputs
+ MOVQ (R11), R13
+ VMOVDQU (R13)(R12*1), Y4
+ MOVQ 24(R11), R13
+ VMOVDQU (R13)(R12*1), Y5
+ MOVQ 48(R11), R13
+ VMOVDQU (R13)(R12*1), Y6
+ MOVQ 72(R11), R13
+ VMOVDQU (R13)(R12*1), Y7
+ MOVQ 96(R11), R13
+ VMOVDQU (R13)(R12*1), Y8
+ MOVQ 120(R11), R13
+ VMOVDQU (R13)(R12*1), Y9
+ MOVQ 144(R11), R13
+ VMOVDQU (R13)(R12*1), Y10
+ MOVQ 168(R11), R13
+ VMOVDQU (R13)(R12*1), Y11
+ MOVQ 192(R11), R13
+ VMOVDQU (R13)(R12*1), Y12
+ MOVQ 216(R11), R13
+ VMOVDQU (R13)(R12*1), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R11), R13
+ VMOVDQU Y4, (R13)(R12*1)
+ MOVQ 24(R11), R13
+ VMOVDQU Y5, (R13)(R12*1)
+ MOVQ 48(R11), R13
+ VMOVDQU Y6, (R13)(R12*1)
+ MOVQ 72(R11), R13
+ VMOVDQU Y7, (R13)(R12*1)
+ MOVQ 96(R11), R13
+ VMOVDQU Y8, (R13)(R12*1)
+ MOVQ 120(R11), R13
+ VMOVDQU Y9, (R13)(R12*1)
+ MOVQ 144(R11), R13
+ VMOVDQU Y10, (R13)(R12*1)
+ MOVQ 168(R11), R13
+ VMOVDQU Y11, (R13)(R12*1)
+ MOVQ 192(R11), R13
+ VMOVDQU Y12, (R13)(R12*1)
+ MOVQ 216(R11), R13
+ VMOVDQU Y13, (R13)(R12*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R12
+ DECQ AX
+ JNZ mulAvxGFNI_7x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_7x10Xor_end:
+ RET
+
+// func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 11 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), CX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R11
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, CX
+
+mulGFNI_8x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z9
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z9, Z8
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z9
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z9
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z9
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z9
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (R9), Z9
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z5, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU64 (R10), Z9
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z6, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU64 (CX), Z9
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z7, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Store 1 outputs
+ VMOVDQU64 Z8, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_8x1_64_loop
+ VZEROUPPER
+
+mulGFNI_8x1_64_end:
+ RET
+
+// func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 11 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), CX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R11
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, CX
+
+mulAvxGFNI_8x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y9, Y8
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y9
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (R9), Y9
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y5, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 6 to 1 outputs
+ VMOVDQU (R10), Y9
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y6, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 7 to 1 outputs
+ VMOVDQU (CX), Y9
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y7, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Store 1 outputs
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_8x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x1_end:
+ RET
+
+// func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 11 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), CX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R11
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, CX
+
+mulGFNI_8x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (R11), Z8
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z9
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z9
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z9
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z9
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z9
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (R9), Z9
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z5, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU64 (R10), Z9
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z6, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU64 (CX), Z9
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z7, Z9, Z9
+ VXORPD Z8, Z9, Z8
+
+ // Store 1 outputs
+ VMOVDQU64 Z8, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_8x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 11 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), CX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R11
+ MOVQ start+72(FP), R12
+
+ // Add start offset to output
+ ADDQ R12, R11
+
+ // Add start offset to input
+ ADDQ R12, DX
+ ADDQ R12, BX
+ ADDQ R12, SI
+ ADDQ R12, DI
+ ADDQ R12, R8
+ ADDQ R12, R9
+ ADDQ R12, R10
+ ADDQ R12, CX
+
+mulAvxGFNI_8x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R11), Y8
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y9
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y9
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y9
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y9
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y9
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (R9), Y9
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y5, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 6 to 1 outputs
+ VMOVDQU (R10), Y9
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y6, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Load and process 32 bytes from input 7 to 1 outputs
+ VMOVDQU (CX), Y9
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y7, Y9, Y9
+ VXORPD Y8, Y9, Y8
+
+ // Store 1 outputs
+ VMOVDQU Y8, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_8x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x1Xor_end:
+ RET
+
+// func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), CX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R11
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+ ADDQ R13, R11
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, CX
+
+mulGFNI_8x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z18
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z18, Z16
+ VGF2P8AFFINEQB $0x00, Z1, Z18, Z17
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z18
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z18
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z18
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z18
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (R9), Z18
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU64 (R10), Z18
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU64 (CX), Z18
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z15, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Store 2 outputs
+ VMOVDQU64 Z16, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z17, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_8x2_64_loop
+ VZEROUPPER
+
+mulGFNI_8x2_64_end:
+ RET
+
+// func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x2(SB), $0-88
+ // Loading 12 of 16 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R12
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+ ADDQ R14, R12
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, DX
+
+mulAvxGFNI_8x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_8x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x2_end:
+ RET
+
+// func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), CX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R11
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+ ADDQ R13, R11
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, CX
+
+mulGFNI_8x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (R12), Z16
+ VMOVDQU64 (R11), Z17
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z18
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z1, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z18
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z18
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z18
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z18
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (R9), Z18
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU64 (R10), Z18
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU64 (CX), Z18
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
+ VXORPD Z16, Z19, Z16
+ VGF2P8AFFINEQB $0x00, Z15, Z18, Z19
+ VXORPD Z17, Z19, Z17
+
+ // Store 2 outputs
+ VMOVDQU64 Z16, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z17, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_8x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x2Xor(SB), $0-88
+ // Loading 12 of 16 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 20 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R12
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+ ADDQ R14, R12
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, DX
+
+mulAvxGFNI_8x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R13), Y12
+ VMOVDQU (R12), Y13
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y13, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_8x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x2Xor_end:
+ RET
+
+// func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), CX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R11
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R11
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, CX
+
+mulGFNI_8x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z27
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z27, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z27, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z27, Z26
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z27
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z27
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z27
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z10, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z27
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (R9), Z27
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z15, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU64 (R10), Z27
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z18, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU64 (CX), Z27
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z21, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z22, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z23, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Store 3 outputs
+ VMOVDQU64 Z24, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z25, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z26, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_8x3_64_loop
+ VZEROUPPER
+
+mulGFNI_8x3_64_end:
+ RET
+
+// func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x3(SB), $0-88
+ // Loading 11 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R12
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R12
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, DX
+
+mulAvxGFNI_8x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 3 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_8x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x3_end:
+ RET
+
+// func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x3_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), CX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R11
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R12
+ ADDQ R14, R13
+ ADDQ R14, R11
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, CX
+
+mulGFNI_8x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (R12), Z24
+ VMOVDQU64 (R13), Z25
+ VMOVDQU64 (R11), Z26
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z27
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z27
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z27
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z27
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z10, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z27
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (R9), Z27
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z15, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU64 (R10), Z27
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z18, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU64 (CX), Z27
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z21, Z27, Z28
+ VXORPD Z24, Z28, Z24
+ VGF2P8AFFINEQB $0x00, Z22, Z27, Z28
+ VXORPD Z25, Z28, Z25
+ VGF2P8AFFINEQB $0x00, Z23, Z27, Z28
+ VXORPD Z26, Z28, Z26
+
+ // Store 3 outputs
+ VMOVDQU64 Z24, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z25, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z26, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_8x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x3Xor(SB), $0-88
+ // Loading 11 of 24 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 29 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R12
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R12
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, DX
+
+mulAvxGFNI_8x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R13), Y11
+ VMOVDQU (R14), Y12
+ VMOVDQU (R12), Y13
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 3 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_8x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x3Xor_end:
+ RET
+
+// func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x4_64(SB), $8-88
+ // Loading 26 of 32 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, DX
+
+mulGFNI_8x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 4 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 4 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 4 outputs
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_8x4_64_loop
+ VZEROUPPER
+
+mulGFNI_8x4_64_end:
+ RET
+
+// func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x4(SB), $8-88
+ // Loading 10 of 32 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, DX
+
+mulAvxGFNI_8x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_8x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x4_end:
+ RET
+
+// func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x4_64Xor(SB), $8-88
+ // Loading 26 of 32 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, DX
+
+mulGFNI_8x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R12), Z29
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 4 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 4 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 4 outputs
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_8x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x4Xor(SB), $8-88
+ // Loading 10 of 32 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 38 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, DX
+
+mulAvxGFNI_8x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R12), Y13
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_8x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x4Xor_end:
+ RET
+
+// func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x5_64(SB), $8-88
+ // Loading 25 of 40 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), AX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_8x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 5 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 5 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_8x5_64_loop
+ VZEROUPPER
+
+mulGFNI_8x5_64_end:
+ RET
+
+// func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x5(SB), $8-88
+ // Loading 9 of 40 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), AX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_8x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_8x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x5_end:
+ RET
+
+// func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x5_64Xor(SB), $8-88
+ // Loading 25 of 40 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), AX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_8x5_64Xor_loop:
+ // Load 5 outputs
+ VMOVDQU64 (R12), Z25
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R11), Z29
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 5 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 5 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ VMOVDQU64 Z25, (R12)
+ ADDQ $0x40, R12
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R11)
+ ADDQ $0x40, R11
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_8x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x5Xor(SB), $8-88
+ // Loading 9 of 40 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 47 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), AX
+ MOVQ out_base+48(FP), R11
+ MOVQ out_base+48(FP), R11
+ MOVQ (R11), R12
+ MOVQ 24(R11), R13
+ MOVQ 48(R11), R14
+ MOVQ 72(R11), R15
+ MOVQ 96(R11), R11
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R11
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, AX
+
+ // Reload length to save a register
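+ // (BP held the start offset above; every other general-purpose register
+ // now carries an input or output pointer, so BP is reused as the counter
+ // of 32-byte blocks.)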
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_8x5Xor_loop:
+ // Load 5 outputs
+ VMOVDQU (R12), Y9
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R11), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R11)
+ ADDQ $0x20, R11
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_8x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x5Xor_end:
+ RET
+
+// func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x6_64(SB), $0-88
+ // Loading 24 of 48 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 56 YMM used
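+ // The _64 variants use AVX-512: SHRQ $0x06 turns the byte count into the
+ // number of 64-byte (ZMM-sized) blocks, and VBROADCASTF32X2 replicates each
+ // 8-byte coefficient matrix into all eight 64-bit lanes of a Z register.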
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 6 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
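+ // The destinations are not pinned in registers ("Destination kept on
+ // stack" above), so each store fetches the destination pointer from the
+ // out slice (base in R12) and indexes it with the running offset in R13.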
+ MOVQ (R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x6_64_loop
+ VZEROUPPER
+
+mulGFNI_8x6_64_end:
+ RET
+
+// func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x6(SB), $0-88
+ // Loading 8 of 48 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 56 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x6_end:
+ RET
+
+// func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x6_64Xor(SB), $0-88
+ // Loading 24 of 48 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 56 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x6_64Xor_loop:
+ // Load 6 outputs
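+ // The Xor variant begins every iteration by loading the current contents
+ // of the six outputs, so this call's products are XORed on top of whatever
+ // data the destinations already hold instead of overwriting it.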
+ MOVQ (R12), R14
+ VMOVDQU64 (R14)(R13*1), Z24
+ MOVQ 24(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z25
+ MOVQ 48(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z26
+ MOVQ 72(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z27
+ MOVQ 96(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z28
+ MOVQ 120(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z29
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 6 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x6Xor(SB), $0-88
+ // Loading 8 of 48 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 56 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x6Xor_loop:
+ // Load 6 outputs
+ MOVQ (R12), R14
+ VMOVDQU (R14)(R13*1), Y8
+ MOVQ 24(R12), R14
+ VMOVDQU (R14)(R13*1), Y9
+ MOVQ 48(R12), R14
+ VMOVDQU (R14)(R13*1), Y10
+ MOVQ 72(R12), R14
+ VMOVDQU (R14)(R13*1), Y11
+ MOVQ 96(R12), R14
+ VMOVDQU (R14)(R13*1), Y12
+ MOVQ 120(R12), R14
+ VMOVDQU (R14)(R13*1), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x6Xor_end:
+ RET
+
+// func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x7_64(SB), $0-88
+ // Loading 23 of 56 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 65 YMM used
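+ // With seven outputs, Z23-Z29 serve as accumulators and Z30/Z31 as input
+ // and scratch, leaving only Z0-Z22 for coefficient matrices ("Loading 23
+ // of 56 tables"); the remaining matrices are broadcast from memory with
+ // VGF2P8AFFINEQB.BCST starting at offset 184(CX).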
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 7 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z23, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x7_64_loop
+ VZEROUPPER
+
+mulGFNI_8x7_64_end:
+ RET
+
+// func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x7(SB), $0-88
+ // Loading 7 of 56 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 65 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x7_end:
+ RET
+
+// func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x7_64Xor(SB), $0-88
+ // Loading 23 of 56 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 65 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x7_64Xor_loop:
+ // Load 7 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 (R14)(R13*1), Z23
+ MOVQ 24(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z24
+ MOVQ 48(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z25
+ MOVQ 72(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z26
+ MOVQ 96(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z27
+ MOVQ 120(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z28
+ MOVQ 144(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z29
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 7 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z23, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x7Xor(SB), $0-88
+ // Loading 7 of 56 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 65 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x7Xor_loop:
+ // Load 7 outputs
+ MOVQ (R12), R14
+ VMOVDQU (R14)(R13*1), Y7
+ MOVQ 24(R12), R14
+ VMOVDQU (R14)(R13*1), Y8
+ MOVQ 48(R12), R14
+ VMOVDQU (R14)(R13*1), Y9
+ MOVQ 72(R12), R14
+ VMOVDQU (R14)(R13*1), Y10
+ MOVQ 96(R12), R14
+ VMOVDQU (R14)(R13*1), Y11
+ MOVQ 120(R12), R14
+ VMOVDQU (R14)(R13*1), Y12
+ MOVQ 144(R12), R14
+ VMOVDQU (R14)(R13*1), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x7Xor_end:
+ RET
+
+// func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x8_64(SB), $0-88
+ // Loading 22 of 64 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 74 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 8 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z22, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z23, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x8_64_loop
+ VZEROUPPER
+
+mulGFNI_8x8_64_end:
+ RET
+
+// func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x8(SB), $0-88
+ // Loading 6 of 64 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 74 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x8_end:
+ RET
+
+// func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x8_64Xor(SB), $0-88
+ // Loading 22 of 64 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 74 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x8_64Xor_loop:
+ // Load 8 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 (R14)(R13*1), Z22
+ MOVQ 24(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z23
+ MOVQ 48(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z24
+ MOVQ 72(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z25
+ MOVQ 96(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z26
+ MOVQ 120(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z27
+ MOVQ 144(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z28
+ MOVQ 168(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z29
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 8 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z22, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z23, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x8Xor(SB), $0-88
+ // Loading 6 of 64 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 74 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x8Xor_loop:
+ // Load 8 outputs
+ MOVQ (R12), R14
+ VMOVDQU (R14)(R13*1), Y6
+ MOVQ 24(R12), R14
+ VMOVDQU (R14)(R13*1), Y7
+ MOVQ 48(R12), R14
+ VMOVDQU (R14)(R13*1), Y8
+ MOVQ 72(R12), R14
+ VMOVDQU (R14)(R13*1), Y9
+ MOVQ 96(R12), R14
+ VMOVDQU (R14)(R13*1), Y10
+ MOVQ 120(R12), R14
+ VMOVDQU (R14)(R13*1), Y11
+ MOVQ 144(R12), R14
+ VMOVDQU (R14)(R13*1), Y12
+ MOVQ 168(R12), R14
+ VMOVDQU (R14)(R13*1), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x8Xor_end:
+ RET
+
+// func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x9_64(SB), $0-88
+ // Loading 21 of 72 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 83 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 9 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z21, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z22, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z23, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x9_64_loop
+ VZEROUPPER
+
+mulGFNI_8x9_64_end:
+ RET
+
+// func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x9(SB), $0-88
+ // Loading 5 of 72 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 83 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x9_end:
+ RET
+
+// func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x9_64Xor(SB), $0-88
+ // Loading 21 of 72 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 83 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x9_64Xor_loop:
+ // Load 9 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 (R14)(R13*1), Z21
+ MOVQ 24(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z22
+ MOVQ 48(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z23
+ MOVQ 72(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z24
+ MOVQ 96(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z25
+ MOVQ 120(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z26
+ MOVQ 144(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z27
+ MOVQ 168(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z28
+ MOVQ 192(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z29
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 9 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z21, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z22, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z23, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x9Xor(SB), $0-88
+ // Loading 5 of 72 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 83 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x9Xor_loop:
+ // Load 9 outputs
+ MOVQ (R12), R14
+ VMOVDQU (R14)(R13*1), Y5
+ MOVQ 24(R12), R14
+ VMOVDQU (R14)(R13*1), Y6
+ MOVQ 48(R12), R14
+ VMOVDQU (R14)(R13*1), Y7
+ MOVQ 72(R12), R14
+ VMOVDQU (R14)(R13*1), Y8
+ MOVQ 96(R12), R14
+ VMOVDQU (R14)(R13*1), Y9
+ MOVQ 120(R12), R14
+ VMOVDQU (R14)(R13*1), Y10
+ MOVQ 144(R12), R14
+ VMOVDQU (R14)(R13*1), Y11
+ MOVQ 168(R12), R14
+ VMOVDQU (R14)(R13*1), Y12
+ MOVQ 192(R12), R14
+ VMOVDQU (R14)(R13*1), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x9Xor_end:
+ RET
+
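+// The GFNI kernels below multiply each input byte by a constant in GF(2^8):
+// every 64-bit matrix word is broadcast across the vector and applied with
+// VGF2P8AFFINEQB (an 8x8 bit-matrix affine transform with a zero constant),
+// and VXORPD accumulates the per-output products, which is addition in the field.
+// The _64 variants use ZMM registers and process 64 bytes per shard per
+// iteration (n is pre-shifted by 6); the Avx variants use YMM and process 32.
+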
+// func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x10_64(SB), $0-88
+ // Loading 20 of 80 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 92 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 10 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z20, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z21, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z22, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z23, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 216(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x10_64_loop
+ VZEROUPPER
+
+mulGFNI_8x10_64_end:
+ RET
+
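+// AVX (YMM) variant of the 8x10 kernel: 32 bytes per shard per iteration.
+// Only 4 of the 80 matrices stay resident in Y0-Y3; the rest are re-broadcast
+// from the matrix slice with VBROADCASTSD inside the loop, and the 10 output
+// pointers are re-read from the out slice header on every store.
+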
+// func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x10(SB), $0-88
+ // Loading 4 of 80 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 92 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y4, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 216(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x10_end:
+ RET
+
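+// The Xor variants below differ only in that they first load the existing
+// output vectors and XOR the new products into them, so callers can
+// accumulate onto previously written shards instead of overwriting them.
+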
+// func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_8x10_64Xor(SB), $0-88
+ // Loading 20 of 80 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 92 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_8x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulGFNI_8x10_64Xor_loop:
+ // Load 10 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 (R14)(R13*1), Z20
+ MOVQ 24(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z21
+ MOVQ 48(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z22
+ MOVQ 72(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z23
+ MOVQ 96(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z24
+ MOVQ 120(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z25
+ MOVQ 144(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z26
+ MOVQ 168(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z27
+ MOVQ 192(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z28
+ MOVQ 216(R12), R14
+ VMOVDQU64 (R14)(R13*1), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 10 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R12), R14
+ VMOVDQU64 Z20, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU64 Z21, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU64 Z22, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU64 Z23, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU64 Z24, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU64 Z25, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU64 Z26, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU64 Z27, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU64 Z28, (R14)(R13*1)
+ MOVQ 216(R12), R14
+ VMOVDQU64 Z29, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R13
+ DECQ AX
+ JNZ mulGFNI_8x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_8x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_8x10Xor(SB), $0-88
+ // Loading 4 of 80 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 92 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_8x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), DX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to input
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, DX
+
+mulAvxGFNI_8x10Xor_loop:
+ // Load 10 outputs
+ MOVQ (R12), R14
+ VMOVDQU (R14)(R13*1), Y4
+ MOVQ 24(R12), R14
+ VMOVDQU (R14)(R13*1), Y5
+ MOVQ 48(R12), R14
+ VMOVDQU (R14)(R13*1), Y6
+ MOVQ 72(R12), R14
+ VMOVDQU (R14)(R13*1), Y7
+ MOVQ 96(R12), R14
+ VMOVDQU (R14)(R13*1), Y8
+ MOVQ 120(R12), R14
+ VMOVDQU (R14)(R13*1), Y9
+ MOVQ 144(R12), R14
+ VMOVDQU (R14)(R13*1), Y10
+ MOVQ 168(R12), R14
+ VMOVDQU (R14)(R13*1), Y11
+ MOVQ 192(R12), R14
+ VMOVDQU (R14)(R13*1), Y12
+ MOVQ 216(R12), R14
+ VMOVDQU (R14)(R13*1), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R12), R14
+ VMOVDQU Y4, (R14)(R13*1)
+ MOVQ 24(R12), R14
+ VMOVDQU Y5, (R14)(R13*1)
+ MOVQ 48(R12), R14
+ VMOVDQU Y6, (R14)(R13*1)
+ MOVQ 72(R12), R14
+ VMOVDQU Y7, (R14)(R13*1)
+ MOVQ 96(R12), R14
+ VMOVDQU Y8, (R14)(R13*1)
+ MOVQ 120(R12), R14
+ VMOVDQU Y9, (R14)(R13*1)
+ MOVQ 144(R12), R14
+ VMOVDQU Y10, (R14)(R13*1)
+ MOVQ 168(R12), R14
+ VMOVDQU Y11, (R14)(R13*1)
+ MOVQ 192(R12), R14
+ VMOVDQU Y12, (R14)(R13*1)
+ MOVQ 216(R12), R14
+ VMOVDQU Y13, (R14)(R13*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R13
+ DECQ AX
+ JNZ mulAvxGFNI_8x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_8x10Xor_end:
+ RET
+
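+// 9-input kernels with a single output: all 9 matrices fit in registers,
+// so the one destination pointer is kept in a GP register and advanced
+// directly instead of being re-read from the out slice each iteration.
+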
+// func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), CX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, CX
+
+mulGFNI_9x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z10
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z10, Z9
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z10
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z10
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z10
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z10
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (R9), Z10
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z5, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU64 (R10), Z10
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z6, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU64 (R11), Z10
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z7, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 8 to 1 outputs
+ VMOVDQU64 (CX), Z10
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z8, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Store 1 outputs
+ VMOVDQU64 Z9, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_9x1_64_loop
+ VZEROUPPER
+
+mulGFNI_9x1_64_end:
+ RET
+
+// func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), CX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, CX
+
+mulAvxGFNI_9x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y10, Y9
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y10
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (R9), Y10
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y5, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 6 to 1 outputs
+ VMOVDQU (R10), Y10
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y6, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 7 to 1 outputs
+ VMOVDQU (R11), Y10
+ ADDQ $0x20, R11
+ VGF2P8AFFINEQB $0x00, Y7, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 8 to 1 outputs
+ VMOVDQU (CX), Y10
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y8, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Store 1 outputs
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_9x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x1_end:
+ RET
+
+// func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), CX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, CX
+
+mulGFNI_9x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (R12), Z9
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z10
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z10
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z10
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z10
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z10
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (R9), Z10
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z5, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU64 (R10), Z10
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z6, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU64 (R11), Z10
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z7, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Load and process 64 bytes from input 8 to 1 outputs
+ VMOVDQU64 (CX), Z10
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z8, Z10, Z10
+ VXORPD Z9, Z10, Z9
+
+ // Store 1 outputs
+ VMOVDQU64 Z9, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_9x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 12 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), CX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R12
+ MOVQ start+72(FP), R13
+
+ // Add start offset to output
+ ADDQ R13, R12
+
+ // Add start offset to input
+ ADDQ R13, DX
+ ADDQ R13, BX
+ ADDQ R13, SI
+ ADDQ R13, DI
+ ADDQ R13, R8
+ ADDQ R13, R9
+ ADDQ R13, R10
+ ADDQ R13, R11
+ ADDQ R13, CX
+
+mulAvxGFNI_9x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R12), Y9
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y10
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y10
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y10
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y10
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y10
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (R9), Y10
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y5, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 6 to 1 outputs
+ VMOVDQU (R10), Y10
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y6, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 7 to 1 outputs
+ VMOVDQU (R11), Y10
+ ADDQ $0x20, R11
+ VGF2P8AFFINEQB $0x00, Y7, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Load and process 32 bytes from input 8 to 1 outputs
+ VMOVDQU (CX), Y10
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y8, Y10, Y10
+ VXORPD Y9, Y10, Y9
+
+ // Store 1 outputs
+ VMOVDQU Y9, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_9x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x1Xor_end:
+ RET
+
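+// 9x2 kernels: with two outputs all 18 matrices fit in ZMM registers for the
+// 64-byte variant, while the 32-byte AVX variant keeps 12 of 18 resident and
+// broadcasts the remainder from the matrix slice inside the loop.
+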
+// func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), CX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R12
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+ ADDQ R14, R12
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, CX
+
+mulGFNI_9x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z20
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z20, Z18
+ VGF2P8AFFINEQB $0x00, Z1, Z20, Z19
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z20
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z20
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z20
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z20
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (R9), Z20
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU64 (R10), Z20
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU64 (R11), Z20
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z15, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 8 to 2 outputs
+ VMOVDQU64 (CX), Z20
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z16, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z17, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Store 2 outputs
+ VMOVDQU64 Z18, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z19, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_9x2_64_loop
+ VZEROUPPER
+
+mulGFNI_9x2_64_end:
+ RET
+
+// func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x2(SB), $0-88
+ // Loading 12 of 18 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R13
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R14
+ ADDQ R15, R13
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, DX
+
+mulAvxGFNI_9x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 2 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R13)
+ ADDQ $0x20, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_9x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x2_end:
+ RET
+
+// func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), CX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R12
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+ ADDQ R14, R12
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, CX
+
+mulGFNI_9x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (R13), Z18
+ VMOVDQU64 (R12), Z19
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z20
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z1, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z20
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z20
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z20
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z20
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (R9), Z20
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU64 (R10), Z20
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU64 (R11), Z20
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z15, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Load and process 64 bytes from input 8 to 2 outputs
+ VMOVDQU64 (CX), Z20
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z16, Z20, Z21
+ VXORPD Z18, Z21, Z18
+ VGF2P8AFFINEQB $0x00, Z17, Z20, Z21
+ VXORPD Z19, Z21, Z19
+
+ // Store 2 outputs
+ VMOVDQU64 Z18, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z19, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_9x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x2Xor(SB), $0-88
+ // Loading 12 of 18 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 22 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R13
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R14
+ ADDQ R15, R13
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, DX
+
+mulAvxGFNI_9x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R14), Y12
+ VMOVDQU (R13), Y13
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 2 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y13, (R13)
+ ADDQ $0x20, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_9x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x2Xor_end:
+ RET
+
+// func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x3_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ VBROADCASTF32X2 208(CX), Z26
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), CX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R12
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R12
+
+ // Add start offset to input
+ ADDQ R15, DX
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, CX
+
+mulGFNI_9x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 3 outputs
+ VMOVDQU64 (CX), Z30
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z26, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 3 outputs
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_9x3_64_loop
+ VZEROUPPER
+
+mulGFNI_9x3_64_end:
+ RET
+
+// func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x3(SB), $8-88
+ // Loading 11 of 27 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R15
+ MOVQ 48(R13), R13
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R13
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, DX
+
+mulAvxGFNI_9x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 3 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 3 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R13)
+ ADDQ $0x20, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_9x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x3_end:
+ RET
+
+// func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x3_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ VBROADCASTF32X2 208(CX), Z26
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), CX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R12
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R13
+ ADDQ R15, R14
+ ADDQ R15, R12
+
+ // Add start offset to input
+ ADDQ R15, DX
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, CX
+
+mulGFNI_9x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (R13), Z27
+ VMOVDQU64 (R14), Z28
+ VMOVDQU64 (R12), Z29
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 3 outputs
+ VMOVDQU64 (CX), Z30
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z26, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 3 outputs
+ VMOVDQU64 Z27, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z28, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z29, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_9x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x3Xor(SB), $8-88
+ // Loading 11 of 27 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 32 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R15
+ MOVQ 48(R13), R13
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R13
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, DX
+
+mulAvxGFNI_9x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R13), Y13
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 3 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 3 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R13)
+ ADDQ $0x20, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_9x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x3Xor_end:
+ RET
+
+// func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x4_64(SB), $8-88
+ // Loading 26 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), AX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_9x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 4 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 4 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 4 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 4 outputs
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_9x4_64_loop
+ VZEROUPPER
+
+mulGFNI_9x4_64_end:
+ RET
+
+// func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x4(SB), $8-88
+ // Loading 10 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), AX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_9x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 4 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_9x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x4_end:
+ RET
+
+// func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x4_64Xor(SB), $8-88
+ // Loading 26 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), AX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_9x4_64Xor_loop:
+ // Load 4 outputs
+ VMOVDQU64 (R13), Z26
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R12), Z29
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 4 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 4 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 4 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 4 outputs
+ VMOVDQU64 Z26, (R13)
+ ADDQ $0x40, R13
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R12)
+ ADDQ $0x40, R12
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_9x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x4Xor(SB), $8-88
+ // Loading 10 of 36 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 42 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), AX
+ MOVQ out_base+48(FP), R12
+ MOVQ out_base+48(FP), R12
+ MOVQ (R12), R13
+ MOVQ 24(R12), R14
+ MOVQ 48(R12), R15
+ MOVQ 72(R12), R12
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R13
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R12
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_9x4Xor_loop:
+ // Load 4 outputs
+ VMOVDQU (R13), Y10
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R12), Y13
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 4 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R12)
+ ADDQ $0x20, R12
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_9x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x4Xor_end:
+ RET
+
+// func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x5_64(SB), $0-88
+ // Loading 25 of 45 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 52 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 5 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 5 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x5_64_loop
+ VZEROUPPER
+
+mulGFNI_9x5_64_end:
+ RET
+
+// func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x5(SB), $0-88
+ // Loading 9 of 45 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 52 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x5_end:
+ RET
+
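Note the differing requirements lines: the mulGFNI_*_64 kernels need AVX-512 (AVX512F/AVX512DQ) on top of GFNI and work on 64-byte ZMM blocks, while the mulAvxGFNI_* kernels need only AVX plus GFNI and work on 32-byte YMM blocks. These are internal kernels selected by the library from the detected CPU features rather than called directly; purely as an illustration of the split, and assuming the klauspost/cpuid/v2 feature-query API, the distinction amounts to:

```go
package gfnisketch

import "github.com/klauspost/cpuid/v2"

// useZMM reports whether the 64-byte mulGFNI_*_64 kernels are usable on this
// CPU; otherwise the 32-byte mulAvxGFNI_* kernels are the GFNI fallback.
func useZMM() bool {
	return cpuid.CPU.Supports(cpuid.GFNI, cpuid.AVX512F, cpuid.AVX512DQ)
}
```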
+// func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x5_64Xor(SB), $0-88
+ // Loading 25 of 45 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 52 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x5_64Xor_loop:
+ // Load 5 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 (R15)(R14*1), Z25
+ MOVQ 24(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z26
+ MOVQ 48(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z27
+ MOVQ 72(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z28
+ MOVQ 96(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z29
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 5 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 5 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x5_64Xor_end:
+ RET
+
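The _Xor variants differ from the plain kernels only in their loop prologue: each iteration first loads the current output bytes (the "Load 5 outputs" block at the top of the loop above) and XORs the fresh products into them instead of overwriting. In terms of the scalar sketch given earlier, that just means seeding the accumulator from the destination; `mulGFNIRefXor` below is again a hypothetical illustration reusing `gf2p8Affine` from that sketch, not library API.

```go
// mulGFNIRefXor accumulates into out rather than overwriting it, matching the
// behaviour of the mulGFNI_*_Xor / mulAvxGFNI_*Xor kernels.
func mulGFNIRefXor(matrix []uint64, in, out [][]byte, start, n int) {
	for pos := start; pos < start+n; pos++ {
		for r := range out {
			acc := out[r][pos] // seed from the existing output shard
			for c := range in {
				acc ^= gf2p8Affine(matrix[c*len(out)+r], in[c][pos])
			}
			out[r][pos] = acc
		}
	}
}
```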
+// func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x5Xor(SB), $0-88
+ // Loading 9 of 45 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 52 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x5Xor_loop:
+ // Load 5 outputs
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y9
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y10
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y11
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y12
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x5Xor_end:
+ RET
+
+// func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x6_64(SB), $0-88
+ // Loading 24 of 54 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 62 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 6 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 6 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x6_64_loop
+ VZEROUPPER
+
+mulGFNI_9x6_64_end:
+ RET
+
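The header comments such as "Loading 24 of 54 tables to registers" describe the register budget: only the first 24 matrix entries fit in Z0–Z23, so from input 4 onward the 9x6 kernel switches to memory-broadcast operands (`VGF2P8AFFINEQB.BCST` here, `VBROADCASTSD` plus `VGF2P8AFFINEQB` in the YMM kernels). The displacement on each broadcast is simply the input-major table index times eight; a tiny helper, hypothetical and only for illustration, makes that explicit:

```go
// tableOffset returns the byte offset of the packed 8x8 matrix for input c and
// output r in an RxC kernel, matching the displacements used above.
func tableOffset(c, r, outputs int) int { return 8 * (c*outputs + r) }

// For example, tableOffset(8, 0, 6) == 384, which is the 384(CX) operand in
// the "input 8 to 6 outputs" block of mulGFNI_9x6_64.
```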
+// func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x6(SB), $0-88
+ // Loading 8 of 54 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 62 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x6_end:
+ RET
+
+// func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x6_64Xor(SB), $0-88
+ // Loading 24 of 54 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 62 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x6_64Xor_loop:
+ // Load 6 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 (R15)(R14*1), Z24
+ MOVQ 24(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z25
+ MOVQ 48(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z26
+ MOVQ 72(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z27
+ MOVQ 96(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z28
+ MOVQ 120(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z29
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 6 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 6 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x6Xor(SB), $0-88
+ // Loading 8 of 54 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 62 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x6Xor_loop:
+ // Load 6 outputs
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y8
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y9
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y10
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y11
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y12
+ MOVQ 120(R13), R15
+ VMOVDQU (R15)(R14*1), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x6Xor_end:
+ RET
+
+// func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x7_64(SB), $0-88
+ // Loading 23 of 63 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 72 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 7 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 7 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z23, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x7_64_loop
+ VZEROUPPER
+
+mulGFNI_9x7_64_end:
+ RET
+
+// func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x7(SB), $0-88
+ // Loading 7 of 63 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 72 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x7_end:
+ RET
+
+// func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x7_64Xor(SB), $0-88
+ // Loading 23 of 63 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 72 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x7_64Xor_loop:
+ // Load 7 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 (R15)(R14*1), Z23
+ MOVQ 24(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z24
+ MOVQ 48(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z25
+ MOVQ 72(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z26
+ MOVQ 96(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z27
+ MOVQ 120(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z28
+ MOVQ 144(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z29
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 7 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 7 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z23, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x7Xor(SB), $0-88
+ // Loading 7 of 63 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 72 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x7Xor_loop:
+ // Load 7 outputs
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y7
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y8
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y9
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y10
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y11
+ MOVQ 120(R13), R15
+ VMOVDQU (R15)(R14*1), Y12
+ MOVQ 144(R13), R15
+ VMOVDQU (R15)(R14*1), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x7Xor_end:
+ RET
+
+// func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x8_64(SB), $0-88
+ // Loading 22 of 72 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 8 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 8 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z22, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z23, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x8_64_loop
+ VZEROUPPER
+
+mulGFNI_9x8_64_end:
+ RET
+
+// func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x8(SB), $0-88
+ // Loading 6 of 72 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x8_end:
+ RET
+
+// func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x8_64Xor(SB), $0-88
+ // Loading 22 of 72 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x8_64Xor_loop:
+ // Load 8 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 (R15)(R14*1), Z22
+ MOVQ 24(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z23
+ MOVQ 48(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z24
+ MOVQ 72(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z25
+ MOVQ 96(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z26
+ MOVQ 120(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z27
+ MOVQ 144(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z28
+ MOVQ 168(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z29
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 8 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 8 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z22, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z23, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x8_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x8Xor(SB), $0-88
+ // Loading 6 of 72 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 82 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x8Xor_loop:
+ // Load 8 outputs
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y6
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y7
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y8
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y9
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y10
+ MOVQ 120(R13), R15
+ VMOVDQU (R15)(R14*1), Y11
+ MOVQ 144(R13), R15
+ VMOVDQU (R15)(R14*1), Y12
+ MOVQ 168(R13), R15
+ VMOVDQU (R15)(R14*1), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x8Xor_end:
+ RET
+
+// func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x9_64(SB), $0-88
+ // Loading 21 of 81 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 92 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 9 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 9 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z21, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z22, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z23, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x9_64_loop
+ VZEROUPPER
+
+mulGFNI_9x9_64_end:
+ RET
+
+// func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x9(SB), $0-88
+ // Loading 5 of 81 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 92 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 640(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x9_end:
+ RET
+
+// func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x9_64Xor(SB), $0-88
+ // Loading 21 of 81 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 92 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x9_64Xor_loop:
+ // Load 9 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 (R15)(R14*1), Z21
+ MOVQ 24(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z22
+ MOVQ 48(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z23
+ MOVQ 72(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z24
+ MOVQ 96(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z25
+ MOVQ 120(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z26
+ MOVQ 144(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z27
+ MOVQ 168(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z28
+ MOVQ 192(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z29
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 9 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 9 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z21, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z22, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z23, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x9Xor(SB), $0-88
+ // Loading 5 of 81 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 92 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x9Xor_loop:
+ // Load 9 outputs
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y5
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y6
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y7
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y8
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y9
+ MOVQ 120(R13), R15
+ VMOVDQU (R15)(R14*1), Y10
+ MOVQ 144(R13), R15
+ VMOVDQU (R15)(R14*1), Y11
+ MOVQ 168(R13), R15
+ VMOVDQU (R15)(R14*1), Y12
+ MOVQ 192(R13), R15
+ VMOVDQU (R15)(R14*1), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 640(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x9Xor_end:
+ RET
+
+// func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x10_64(SB), $0-88
+ // Loading 20 of 90 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 102 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 10 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 10 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z20, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z21, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z22, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z23, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 216(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x10_64_loop
+ VZEROUPPER
+
+mulGFNI_9x10_64_end:
+ RET
+
+// func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x10(SB), $0-88
+ // Loading 4 of 90 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 102 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 640(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 648(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 656(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 664(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 672(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 680(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 688(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 696(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 704(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 712(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y4, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 216(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x10_end:
+ RET
+
+// func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x10_64Xor(SB), $0-88
+ // Loading 20 of 90 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 102 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_9x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulGFNI_9x10_64Xor_loop:
+ // Load 10 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 (R15)(R14*1), Z20
+ MOVQ 24(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z21
+ MOVQ 48(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z22
+ MOVQ 72(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z23
+ MOVQ 96(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z24
+ MOVQ 120(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z25
+ MOVQ 144(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z26
+ MOVQ 168(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z27
+ MOVQ 192(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z28
+ MOVQ 216(R13), R15
+ VMOVDQU64 (R15)(R14*1), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 10 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 10 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R13), R15
+ VMOVDQU64 Z20, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU64 Z21, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU64 Z22, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU64 Z23, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU64 Z24, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU64 Z25, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU64 Z26, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU64 Z27, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU64 Z28, (R15)(R14*1)
+ MOVQ 216(R13), R15
+ VMOVDQU64 Z29, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R14
+ DECQ AX
+ JNZ mulGFNI_9x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_9x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x10Xor(SB), $0-88
+ // Loading 4 of 90 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 102 YMM used
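+ // This Xor variant accumulates: the 10 output vectors are loaded from the
+ // destination shards at the top of each loop and every GF(2^8) product is
+ // XORed into them before being stored back. With 10 outputs there are not
+ // enough general-purpose registers to pin every destination pointer, so the
+ // output slice base stays in R13 and each pointer is re-read per load/store,
+ // indexed by the running offset in R14 ("Destination kept on stack").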
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_9x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), DX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to input
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, DX
+
+mulAvxGFNI_9x10Xor_loop:
+ // Load 10 outputs
+ MOVQ (R13), R15
+ VMOVDQU (R15)(R14*1), Y4
+ MOVQ 24(R13), R15
+ VMOVDQU (R15)(R14*1), Y5
+ MOVQ 48(R13), R15
+ VMOVDQU (R15)(R14*1), Y6
+ MOVQ 72(R13), R15
+ VMOVDQU (R15)(R14*1), Y7
+ MOVQ 96(R13), R15
+ VMOVDQU (R15)(R14*1), Y8
+ MOVQ 120(R13), R15
+ VMOVDQU (R15)(R14*1), Y9
+ MOVQ 144(R13), R15
+ VMOVDQU (R15)(R14*1), Y10
+ MOVQ 168(R13), R15
+ VMOVDQU (R15)(R14*1), Y11
+ MOVQ 192(R13), R15
+ VMOVDQU (R15)(R14*1), Y12
+ MOVQ 216(R13), R15
+ VMOVDQU (R15)(R14*1), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 640(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 648(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 656(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 664(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 672(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 680(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 688(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 696(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 704(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 712(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R13), R15
+ VMOVDQU Y4, (R15)(R14*1)
+ MOVQ 24(R13), R15
+ VMOVDQU Y5, (R15)(R14*1)
+ MOVQ 48(R13), R15
+ VMOVDQU Y6, (R15)(R14*1)
+ MOVQ 72(R13), R15
+ VMOVDQU Y7, (R15)(R14*1)
+ MOVQ 96(R13), R15
+ VMOVDQU Y8, (R15)(R14*1)
+ MOVQ 120(R13), R15
+ VMOVDQU Y9, (R15)(R14*1)
+ MOVQ 144(R13), R15
+ VMOVDQU Y10, (R15)(R14*1)
+ MOVQ 168(R13), R15
+ VMOVDQU Y11, (R15)(R14*1)
+ MOVQ 192(R13), R15
+ VMOVDQU Y12, (R15)(R14*1)
+ MOVQ 216(R13), R15
+ VMOVDQU Y13, (R15)(R14*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R14
+ DECQ AX
+ JNZ mulAvxGFNI_9x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_9x10Xor_end:
+ RET
+
+// func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x1_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 13 YMM used
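+ // The _64 kernels operate on 512-bit ZMM registers, consuming 64 bytes of
+ // every input shard per iteration (hence n is shifted right by 6 below to
+ // form the loop count); the mulAvxGFNI_* kernels do the same work on 32-byte
+ // YMM blocks and shift by 5 instead. start+72(FP) is a byte offset that is
+ // added to every input and output pointer before the loop begins.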
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x1_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), R12
+ MOVQ 216(CX), CX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, CX
+
+mulGFNI_10x1_64_loop:
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z11
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z11, Z10
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z11
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z11
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z11
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z11
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (R9), Z11
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z5, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU64 (R10), Z11
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z6, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU64 (R11), Z11
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z7, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 8 to 1 outputs
+ VMOVDQU64 (R12), Z11
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB $0x00, Z8, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 9 to 1 outputs
+ VMOVDQU64 (CX), Z11
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z9, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Store 1 outputs
+ VMOVDQU64 Z10, (R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_10x1_64_loop
+ VZEROUPPER
+
+mulGFNI_10x1_64_end:
+ RET
+
+// func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x1(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 13 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x1_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), R12
+ MOVQ 216(CX), CX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, CX
+
+mulAvxGFNI_10x1_loop:
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y11, Y10
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y11
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (R9), Y11
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y5, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 6 to 1 outputs
+ VMOVDQU (R10), Y11
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y6, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 7 to 1 outputs
+ VMOVDQU (R11), Y11
+ ADDQ $0x20, R11
+ VGF2P8AFFINEQB $0x00, Y7, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 8 to 1 outputs
+ VMOVDQU (R12), Y11
+ ADDQ $0x20, R12
+ VGF2P8AFFINEQB $0x00, Y8, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 9 to 1 outputs
+ VMOVDQU (CX), Y11
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y9, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Store 1 outputs
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_10x1_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x1_end:
+ RET
+
+// func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x1_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 13 YMM used
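+ // Unlike mulGFNI_10x1_64 above, which writes the product of input 0 straight
+ // into Z10, this Xor form first loads the existing output block into Z10 and
+ // XOR-accumulates every input's contribution into it, so the caller's prior
+ // output data is preserved and combined.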
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x1_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), R12
+ MOVQ 216(CX), CX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, CX
+
+mulGFNI_10x1_64Xor_loop:
+ // Load 1 outputs
+ VMOVDQU64 (R13), Z10
+
+ // Load and process 64 bytes from input 0 to 1 outputs
+ VMOVDQU64 (DX), Z11
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 1 to 1 outputs
+ VMOVDQU64 (BX), Z11
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z1, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 2 to 1 outputs
+ VMOVDQU64 (SI), Z11
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z2, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 3 to 1 outputs
+ VMOVDQU64 (DI), Z11
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z3, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 4 to 1 outputs
+ VMOVDQU64 (R8), Z11
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z4, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 5 to 1 outputs
+ VMOVDQU64 (R9), Z11
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z5, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 6 to 1 outputs
+ VMOVDQU64 (R10), Z11
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z6, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 7 to 1 outputs
+ VMOVDQU64 (R11), Z11
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z7, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 8 to 1 outputs
+ VMOVDQU64 (R12), Z11
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB $0x00, Z8, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Load and process 64 bytes from input 9 to 1 outputs
+ VMOVDQU64 (CX), Z11
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z9, Z11, Z11
+ VXORPD Z10, Z11, Z10
+
+ // Store 1 outputs
+ VMOVDQU64 Z10, (R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_10x1_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x1_64Xor_end:
+ RET
+
+// func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x1Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 13 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x1Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), R12
+ MOVQ 216(CX), CX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R13
+ MOVQ start+72(FP), R14
+
+ // Add start offset to output
+ ADDQ R14, R13
+
+ // Add start offset to input
+ ADDQ R14, DX
+ ADDQ R14, BX
+ ADDQ R14, SI
+ ADDQ R14, DI
+ ADDQ R14, R8
+ ADDQ R14, R9
+ ADDQ R14, R10
+ ADDQ R14, R11
+ ADDQ R14, R12
+ ADDQ R14, CX
+
+mulAvxGFNI_10x1Xor_loop:
+ // Load 1 outputs
+ VMOVDQU (R13), Y10
+
+ // Load and process 32 bytes from input 0 to 1 outputs
+ VMOVDQU (DX), Y11
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 1 to 1 outputs
+ VMOVDQU (BX), Y11
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y1, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 2 to 1 outputs
+ VMOVDQU (SI), Y11
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 3 to 1 outputs
+ VMOVDQU (DI), Y11
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y3, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 4 to 1 outputs
+ VMOVDQU (R8), Y11
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y4, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 5 to 1 outputs
+ VMOVDQU (R9), Y11
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y5, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 6 to 1 outputs
+ VMOVDQU (R10), Y11
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y6, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 7 to 1 outputs
+ VMOVDQU (R11), Y11
+ ADDQ $0x20, R11
+ VGF2P8AFFINEQB $0x00, Y7, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 8 to 1 outputs
+ VMOVDQU (R12), Y11
+ ADDQ $0x20, R12
+ VGF2P8AFFINEQB $0x00, Y8, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Load and process 32 bytes from input 9 to 1 outputs
+ VMOVDQU (CX), Y11
+ ADDQ $0x20, CX
+ VGF2P8AFFINEQB $0x00, Y9, Y11, Y11
+ VXORPD Y10, Y11, Y10
+
+ // Store 1 outputs
+ VMOVDQU Y10, (R13)
+ ADDQ $0x20, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_10x1Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x1Xor_end:
+ RET
+
+// func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x2_64(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 24 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x2_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), R12
+ MOVQ 216(CX), CX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R13
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R14
+ ADDQ R15, R13
+
+ // Add start offset to input
+ ADDQ R15, DX
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, CX
+
+mulGFNI_10x2_64_loop:
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z22
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z22, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z22, Z21
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z22
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z3, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z22
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z5, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z22
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z7, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z22
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z9, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (R9), Z22
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z10, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU64 (R10), Z22
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z12, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z13, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU64 (R11), Z22
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z14, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z15, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 8 to 2 outputs
+ VMOVDQU64 (R12), Z22
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB $0x00, Z16, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z17, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 9 to 2 outputs
+ VMOVDQU64 (CX), Z22
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z18, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z19, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Store 2 outputs
+ VMOVDQU64 Z20, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z21, (R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_10x2_64_loop
+ VZEROUPPER
+
+mulGFNI_10x2_64_end:
+ RET
+
+// func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x2(SB), $8-88
+ // Loading 12 of 20 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 24 YMM used
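+ // Only the first 12 of the 20 coefficient tables fit in YMM registers
+ // (Y0-Y11); for inputs 6 through 9 the remaining tables are re-broadcast
+ // from the matrix base in CX with VBROADCASTSD inside the loop.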
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x2_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ (R14), R15
+ MOVQ 24(R14), R14
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R15
+ ADDQ BP, R14
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, DX
+
+mulAvxGFNI_10x2_loop:
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 2 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 2 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R14)
+ ADDQ $0x20, R14
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_10x2_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x2_end:
+ RET
+
+// func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x2_64Xor(SB), $0-88
+ // Loading all tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 24 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x2_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), CX
+ MOVQ (CX), DX
+ MOVQ 24(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 72(CX), DI
+ MOVQ 96(CX), R8
+ MOVQ 120(CX), R9
+ MOVQ 144(CX), R10
+ MOVQ 168(CX), R11
+ MOVQ 192(CX), R12
+ MOVQ 216(CX), CX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R13
+ MOVQ start+72(FP), R15
+
+ // Add start offset to output
+ ADDQ R15, R14
+ ADDQ R15, R13
+
+ // Add start offset to input
+ ADDQ R15, DX
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, CX
+
+mulGFNI_10x2_64Xor_loop:
+ // Load 2 outputs
+ VMOVDQU64 (R14), Z20
+ VMOVDQU64 (R13), Z21
+
+ // Load and process 64 bytes from input 0 to 2 outputs
+ VMOVDQU64 (DX), Z22
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 1 to 2 outputs
+ VMOVDQU64 (BX), Z22
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z2, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z3, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 2 to 2 outputs
+ VMOVDQU64 (SI), Z22
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z5, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 3 to 2 outputs
+ VMOVDQU64 (DI), Z22
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z6, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z7, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 4 to 2 outputs
+ VMOVDQU64 (R8), Z22
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z8, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z9, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 5 to 2 outputs
+ VMOVDQU64 (R9), Z22
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z10, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 6 to 2 outputs
+ VMOVDQU64 (R10), Z22
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z12, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z13, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 7 to 2 outputs
+ VMOVDQU64 (R11), Z22
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z14, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z15, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 8 to 2 outputs
+ VMOVDQU64 (R12), Z22
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB $0x00, Z16, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z17, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Load and process 64 bytes from input 9 to 2 outputs
+ VMOVDQU64 (CX), Z22
+ ADDQ $0x40, CX
+ VGF2P8AFFINEQB $0x00, Z18, Z22, Z23
+ VXORPD Z20, Z23, Z20
+ VGF2P8AFFINEQB $0x00, Z19, Z22, Z23
+ VXORPD Z21, Z23, Z21
+
+ // Store 2 outputs
+ VMOVDQU64 Z20, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z21, (R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulGFNI_10x2_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x2_64Xor_end:
+ RET
+
+// func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x2Xor(SB), $8-88
+ // Loading 12 of 20 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 24 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x2Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ VBROADCASTSD 88(CX), Y11
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ (R14), R15
+ MOVQ 24(R14), R14
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R15
+ ADDQ BP, R14
+
+ // Add start offset to input
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, R13
+ ADDQ BP, DX
+
+mulAvxGFNI_10x2Xor_loop:
+ // Load 2 outputs
+ VMOVDQU (R15), Y12
+ VMOVDQU (R14), Y13
+
+ // Load and process 32 bytes from input 0 to 2 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 2 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 2 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 2 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 2 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 2 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 2 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 2 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 2 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 2 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 2 outputs
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R14)
+ ADDQ $0x20, R14
+
+ // Prepare for next loop
+ DECQ AX
+ JNZ mulAvxGFNI_10x2Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x2Xor_end:
+ RET
+
+// func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x3_64(SB), $8-88
+ // Loading 27 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 35 YMM used
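+ // With ten input pointers and three output pointers, the scratch registers
+ // are exhausted: the last input pointer lives in AX and the loop counter is
+ // rebuilt in BP from n once the start offsets have been applied ("Reload
+ // length to save a register"). Using BP as scratch appears to be why this
+ // TEXT directive declares an 8-byte frame rather than $0.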
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x3_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ VBROADCASTF32X2 208(CX), Z26
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), R12
+ MOVQ 216(AX), AX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R15
+ MOVQ 48(R13), R13
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R13
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_10x3_64_loop:
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 3 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z26, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 3 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 3 outputs
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_10x3_64_loop
+ VZEROUPPER
+
+mulGFNI_10x3_64_end:
+ RET
+
+// func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x3(SB), $8-88
+ // Loading 11 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 35 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x3_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), R12
+ MOVQ 216(AX), AX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R15
+ MOVQ 48(R13), R13
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R13
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_10x3_loop:
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 3 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 3 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 3 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 3 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R13)
+ ADDQ $0x20, R13
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_10x3_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x3_end:
+ RET
+
+// func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x3_64Xor(SB), $8-88
+ // Loading 27 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 35 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x3_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ VBROADCASTF32X2 208(CX), Z26
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), R12
+ MOVQ 216(AX), AX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R15
+ MOVQ 48(R13), R13
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R13
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x06, BP
+
+mulGFNI_10x3_64Xor_loop:
+ // Load 3 outputs
+ VMOVDQU64 (R14), Z27
+ VMOVDQU64 (R15), Z28
+ VMOVDQU64 (R13), Z29
+
+ // Load and process 64 bytes from input 0 to 3 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 3 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 3 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 3 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 3 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 3 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 3 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 3 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 3 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z26, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 3 outputs
+ VMOVDQU64 (AX), Z30
+ ADDQ $0x40, AX
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 3 outputs
+ VMOVDQU64 Z27, (R14)
+ ADDQ $0x40, R14
+ VMOVDQU64 Z28, (R15)
+ ADDQ $0x40, R15
+ VMOVDQU64 Z29, (R13)
+ ADDQ $0x40, R13
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulGFNI_10x3_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x3_64Xor_end:
+ RET
+
+// func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x3Xor(SB), $8-88
+ // Loading 11 of 30 tables to registers
+ // Destination kept in GP registers
+ // Full registers estimated 35 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x3Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ VBROADCASTSD 80(CX), Y10
+ MOVQ in_base+24(FP), AX
+ MOVQ (AX), DX
+ MOVQ 24(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 72(AX), DI
+ MOVQ 96(AX), R8
+ MOVQ 120(AX), R9
+ MOVQ 144(AX), R10
+ MOVQ 168(AX), R11
+ MOVQ 192(AX), R12
+ MOVQ 216(AX), AX
+ MOVQ out_base+48(FP), R13
+ MOVQ out_base+48(FP), R13
+ MOVQ (R13), R14
+ MOVQ 24(R13), R15
+ MOVQ 48(R13), R13
+ MOVQ start+72(FP), BP
+
+ // Add start offset to output
+ ADDQ BP, R14
+ ADDQ BP, R15
+ ADDQ BP, R13
+
+ // Add start offset to input
+ ADDQ BP, DX
+ ADDQ BP, BX
+ ADDQ BP, SI
+ ADDQ BP, DI
+ ADDQ BP, R8
+ ADDQ BP, R9
+ ADDQ BP, R10
+ ADDQ BP, R11
+ ADDQ BP, R12
+ ADDQ BP, AX
+
+ // Reload length to save a register
+ MOVQ n+80(FP), BP
+ SHRQ $0x05, BP
+
+mulAvxGFNI_10x3Xor_loop:
+ // Load 3 outputs
+ VMOVDQU (R14), Y11
+ VMOVDQU (R15), Y12
+ VMOVDQU (R13), Y13
+
+ // Load and process 32 bytes from input 0 to 3 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 3 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 3 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 3 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 3 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 3 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 3 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 3 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 3 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 3 outputs
+ VMOVDQU (AX), Y14
+ ADDQ $0x20, AX
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 3 outputs
+ VMOVDQU Y11, (R14)
+ ADDQ $0x20, R14
+ VMOVDQU Y12, (R15)
+ ADDQ $0x20, R15
+ VMOVDQU Y13, (R13)
+ ADDQ $0x20, R13
+
+ // Prepare for next loop
+ DECQ BP
+ JNZ mulAvxGFNI_10x3Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x3Xor_end:
+ RET
+
+// func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x4_64(SB), $8-88
+ // Loading 26 of 40 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 46 YMM used
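+ // SHRQ $0x06 turns the byte count n into the number of 64-byte blocks processed per
+ // iteration. VBROADCASTF32X2 splats one 64-bit coefficient matrix across a ZMM
+ // register; tables beyond the 26 that fit in Z0-Z25 are applied directly from memory
+ // through the .BCST embedded-broadcast form of VGF2P8AFFINEQB. With four outputs the
+ // destination pointers no longer fit in spare GP registers, so each store re-reads the
+ // output slice headers from R14 and indexes them with the running offset in R15
+ // ("Destination kept on stack").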
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x4_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x4_64_loop:
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 4 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 4 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 4 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 4 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 4 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x4_64_loop
+ VZEROUPPER
+
+mulGFNI_10x4_64_end:
+ RET
+
+// func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x4(SB), $8-88
+ // Loading 10 of 40 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 46 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x4_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 4 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x4_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x4_end:
+ RET
+
+// func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x4_64Xor(SB), $8-88
+ // Loading 26 of 40 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 46 YMM used
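+ // Identical structure to mulGFNI_10x4_64 above, except that the loop first loads the
+ // four existing output blocks and XORs every product into them, so partial results
+ // accumulate across calls instead of being overwritten.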
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x4_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ VBROADCASTF32X2 200(CX), Z25
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x4_64Xor_loop:
+ // Load 4 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 (BP)(R15*1), Z26
+ MOVQ 24(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z27
+ MOVQ 48(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z28
+ MOVQ 72(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z29
+
+ // Load and process 64 bytes from input 0 to 4 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 4 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 4 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 4 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 4 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 4 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 4 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 4 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 4 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 4 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 4 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x4_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x4_64Xor_end:
+ RET
+
+// func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x4Xor(SB), $8-88
+ // Loading 10 of 40 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 46 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x4Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ VBROADCASTSD 72(CX), Y9
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x4Xor_loop:
+ // Load 4 outputs
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y10
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y11
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y12
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y13
+
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 4 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 4 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 4 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x4Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x4Xor_end:
+ RET
+
+// func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x5_64(SB), $8-88
+ // Loading 25 of 50 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 57 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x5_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x5_64_loop:
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 5 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 5 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 5 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x5_64_loop
+ VZEROUPPER
+
+mulGFNI_10x5_64_end:
+ RET
+
+// func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x5(SB), $8-88
+ // Loading 9 of 50 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 57 YMM used
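+ // Register pressure grows with the output count: only 9 of the 50 coefficient
+ // matrices fit in Y0-Y8, so the remaining ones are broadcast from 72(CX) through
+ // 392(CX) inside the loop, trading extra loads for fewer spills.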
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x5_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x5_loop:
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 5 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x5_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x5_end:
+ RET
+
+// func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x5_64Xor(SB), $8-88
+ // Loading 25 of 50 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 57 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x5_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ VBROADCASTF32X2 192(CX), Z24
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x5_64Xor_loop:
+ // Load 5 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 (BP)(R15*1), Z25
+ MOVQ 24(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z26
+ MOVQ 48(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z27
+ MOVQ 72(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z28
+ MOVQ 96(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z29
+
+ // Load and process 64 bytes from input 0 to 5 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 5 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 5 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 5 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 5 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 5 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 5 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 5 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 5 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 5 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 5 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x5_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x5_64Xor_end:
+ RET
+
+// func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x5Xor(SB), $8-88
+ // Loading 9 of 50 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 57 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x5Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ VBROADCASTSD 64(CX), Y8
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x5Xor_loop:
+ // Load 5 outputs
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y9
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y10
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y11
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y12
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y13
+
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 5 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 5 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 5 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x5Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x5Xor_end:
+ RET
+
+// func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x6_64(SB), $8-88
+ // Loading 24 of 60 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 68 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x6_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x6_64_loop:
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 6 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 6 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 6 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x6_64_loop
+ VZEROUPPER
+
+mulGFNI_10x6_64_end:
+ RET
+
+// func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x6(SB), $8-88
+ // Loading 8 of 60 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 68 YMM used
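+ // The kernel multiplies 10 input shards by a 10x6 coefficient matrix in GF(2^8):
+ // each 64-bit coefficient is an 8x8 bit-matrix, VGF2P8AFFINEQB applies it to 32
+ // input bytes per iteration, and VXORPD folds the per-input products into the six
+ // output accumulators before they are stored through the out_base pointers.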
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x6_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 6 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x6_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x6_end:
+ RET
+
+// func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x6_64Xor(SB), $8-88
+ // Loading 24 of 60 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 68 YMM used
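+ // Xor variant: the current contents of the 6 destination slices are loaded at the
+ // top of each iteration and the new GF(2^8) products are XORed into them, so the
+ // kernel accumulates into existing output rather than overwriting it.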
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x6_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ VBROADCASTF32X2 184(CX), Z23
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x6_64Xor_loop:
+ // Load 6 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 (BP)(R15*1), Z24
+ MOVQ 24(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z25
+ MOVQ 48(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z26
+ MOVQ 72(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z27
+ MOVQ 96(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z28
+ MOVQ 120(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z29
+
+ // Load and process 64 bytes from input 0 to 6 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 6 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 6 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 6 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 6 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 6 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 6 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 6 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 6 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 6 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 6 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x6_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x6_64Xor_end:
+ RET
+
+// func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x6Xor(SB), $8-88
+ // Loading 8 of 60 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 68 YMM used
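+ // AVX (32 bytes per iteration) counterpart of mulGFNI_10x6_64Xor: outputs are
+ // loaded, updated with VGF2P8AFFINEQB/VXORPD, and written back in place.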
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x6Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ VBROADCASTSD 56(CX), Y7
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x6Xor_loop:
+ // Load 6 outputs
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y8
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y9
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y10
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y11
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y12
+ MOVQ 120(R14), BP
+ VMOVDQU (BP)(R15*1), Y13
+
+ // Load and process 32 bytes from input 0 to 6 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 6 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 6 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 6 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x6Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x6Xor_end:
+ RET
+
+// func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x7_64(SB), $8-88
+ // Loading 23 of 70 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 79 YMM used
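+ // Only 23 of the 70 coefficient matrices fit in ZMM registers (Z0-Z22); the
+ // remaining coefficients are broadcast from memory inside the loop via
+ // VGF2P8AFFINEQB.BCST.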
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x7_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x7_64_loop:
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 7 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 7 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 7 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z23, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x7_64_loop
+ VZEROUPPER
+
+mulGFNI_10x7_64_end:
+ RET
+
+// func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x7(SB), $8-88
+ // Loading 7 of 70 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 79 YMM used
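+ // With 7 outputs only the first 7 coefficients stay resident in YMM registers
+ // (Y0-Y6); the rest are re-broadcast each iteration with VBROADCASTSD before use.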
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x7_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 7 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x7_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x7_end:
+ RET
+
+// func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x7_64Xor(SB), $8-88
+ // Loading 23 of 70 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 79 YMM used
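+ // Xor form of mulGFNI_10x7_64: the 7 destination vectors are preloaded so the
+ // computed products accumulate into whatever the output slices already hold.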
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x7_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ VBROADCASTF32X2 176(CX), Z22
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x7_64Xor_loop:
+ // Load 7 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 (BP)(R15*1), Z23
+ MOVQ 24(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z24
+ MOVQ 48(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z25
+ MOVQ 72(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z26
+ MOVQ 96(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z27
+ MOVQ 120(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z28
+ MOVQ 144(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z29
+
+ // Load and process 64 bytes from input 0 to 7 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 7 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 7 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 7 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 7 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 7 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 7 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 7 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 7 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 7 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 7 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z23, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x7_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x7_64Xor_end:
+ RET
+
+// func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x7Xor(SB), $8-88
+ // Loading 7 of 70 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 79 YMM used
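+ // 32-byte AVX variant of the 10x7 Xor kernel, with the same
+ // accumulate-into-destination behaviour as mulGFNI_10x7_64Xor.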
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x7Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ VBROADCASTSD 48(CX), Y6
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x7Xor_loop:
+ // Load 7 outputs
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y7
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y8
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y9
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y10
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y11
+ MOVQ 120(R14), BP
+ VMOVDQU (BP)(R15*1), Y12
+ MOVQ 144(R14), BP
+ VMOVDQU (BP)(R15*1), Y13
+
+ // Load and process 32 bytes from input 0 to 7 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 7 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 7 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 7 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 7 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 7 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 7 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 7 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 7 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 7 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x7Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x7Xor_end:
+ RET
+
+// func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x8_64(SB), $8-88
+ // Loading 22 of 80 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 90 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x8_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x8_64_loop:
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 8 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 8 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 8 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z22, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z23, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x8_64_loop
+ VZEROUPPER
+
+mulGFNI_10x8_64_end:
+ RET
+
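+// Editorial annotation (not avo generator output): a hedged sketch of what the
+// mulGFNI_*_64 kernels above compute. Each 64-bit value broadcast from the
+// matrix slice is the 8x8 GF(2) bit-matrix form of one GF(2^8) coefficient, so
+// VGF2P8AFFINEQB $0x00, Ztable, Zdata, Zdst (operands in the order used in this
+// file) multiplies all 64 data bytes by that coefficient in one instruction; the
+// $0x00 immediate is the affine constant, here zero. VXORPD then accumulates the
+// products, since addition in GF(2^8) is XOR, so each loop iteration consumes 64
+// bytes from every one of the 10 input shards and emits 64 bytes per output shard.
+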
+// func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x8(SB), $8-88
+ // Loading 6 of 80 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 90 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x8_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x8_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
+ VBROADCASTSD 48(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 56(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 8 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x8_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x8_end:
+ RET
+
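+// Editorial annotation (not avo generator output): mulAvxGFNI_* is the 256-bit
+// variant of the same scheme for CPUs that have GFNI but not usable AVX-512.
+// Only the first 6 of the 80 coefficient tables stay resident in Y0-Y5; the rest
+// are re-broadcast from the matrix slice with VBROADCASTSD inside the loop, and
+// each iteration advances every shard pointer by 32 bytes rather than 64.
+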
+// func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x8_64Xor(SB), $8-88
+ // Loading 22 of 80 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 90 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x8_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ VBROADCASTF32X2 168(CX), Z21
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x8_64Xor_loop:
+ // Load 8 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 (BP)(R15*1), Z22
+ MOVQ 24(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z23
+ MOVQ 48(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z24
+ MOVQ 72(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z25
+ MOVQ 96(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z26
+ MOVQ 120(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z27
+ MOVQ 144(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z28
+ MOVQ 168(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z29
+
+ // Load and process 64 bytes from input 0 to 8 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 8 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 8 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 8 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 8 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 8 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 8 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 8 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 8 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 8 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 8 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z22, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z23, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x8_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x8_64Xor_end:
+ RET
+
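+// Editorial annotation (not avo generator output): the *Xor variants differ from
+// the plain kernels only in that each loop iteration first reloads the current
+// contents of the output shards ("Load 8 outputs") and XORs the new products
+// into them, so the caller can accumulate this matrix product on top of data
+// already present in the outputs instead of overwriting it.
+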
+// func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x8Xor(SB), $8-88
+ // Loading 6 of 80 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 90 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x8Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ VBROADCASTSD 40(CX), Y5
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x8Xor_loop:
+ // Load 8 outputs
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y6
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y7
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y8
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y9
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y10
+ MOVQ 120(R14), BP
+ VMOVDQU (BP)(R15*1), Y11
+ MOVQ 144(R14), BP
+ VMOVDQU (BP)(R15*1), Y12
+ MOVQ 168(R14), BP
+ VMOVDQU (BP)(R15*1), Y13
+
+ // Load and process 32 bytes from input 0 to 8 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 8 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 8 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 8 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x8Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x8Xor_end:
+ RET
+
+// func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x9_64(SB), $8-88
+ // Loading 21 of 90 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 101 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x9_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x9_64_loop:
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 9 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 9 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 9 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z21, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z22, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z23, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x9_64_loop
+ VZEROUPPER
+
+mulGFNI_10x9_64_end:
+ RET
+
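+// Editorial annotation (not avo generator output): "Destination kept on stack"
+// in the generator comments means the output pointers get no dedicated
+// registers. R14 holds the backing array of the out [][]byte, whose slice
+// headers are 24 bytes apiece on amd64, so MOVQ 24*i(R14), BP reloads out[i]'s
+// base pointer before each store, indexed by the running byte offset in R15.
+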
+// func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x9(SB), $8-88
+ // Loading 5 of 90 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 101 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x9_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
+ VBROADCASTSD 40(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 48(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 56(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 64(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 9 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 640(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 648(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 656(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 664(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 672(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 680(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 688(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 696(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 704(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 712(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x9_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x9_end:
+ RET
+
+// func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x9_64Xor(SB), $8-88
+ // Loading 21 of 90 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 101 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x9_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ VBROADCASTF32X2 160(CX), Z20
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x9_64Xor_loop:
+ // Load 9 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 (BP)(R15*1), Z21
+ MOVQ 24(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z22
+ MOVQ 48(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z23
+ MOVQ 72(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z24
+ MOVQ 96(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z25
+ MOVQ 120(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z26
+ MOVQ 144(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z27
+ MOVQ 168(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z28
+ MOVQ 192(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z29
+
+ // Load and process 64 bytes from input 0 to 9 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 9 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 9 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 9 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 9 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 9 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 9 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 9 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 9 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 9 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 9 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z21, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z22, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z23, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x9_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x9_64Xor_end:
+ RET
+
+// func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x9Xor(SB), $8-88
+ // Loading 5 of 90 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 101 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x9Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ VBROADCASTSD 32(CX), Y4
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x9Xor_loop:
+ // Load 9 outputs
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y5
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y6
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y7
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y8
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y9
+ MOVQ 120(R14), BP
+ VMOVDQU (BP)(R15*1), Y10
+ MOVQ 144(R14), BP
+ VMOVDQU (BP)(R15*1), Y11
+ MOVQ 168(R14), BP
+ VMOVDQU (BP)(R15*1), Y12
+ MOVQ 192(R14), BP
+ VMOVDQU (BP)(R15*1), Y13
+
+ // Load and process 32 bytes from input 0 to 9 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 9 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 640(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 9 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 648(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 656(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 664(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 672(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 680(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 688(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 696(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 704(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 712(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 9 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x9Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x9Xor_end:
+ RET
+
+// func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x10_64(SB), $8-88
+ // Loading 20 of 100 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 112 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x10_64_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x10_64_loop:
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 10 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 10 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 10 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z20, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z21, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z22, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z23, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 216(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x10_64_loop
+ VZEROUPPER
+
+mulGFNI_10x10_64_end:
+ RET
+
+// func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x10(SB), $8-88
+ // Loading 4 of 100 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 112 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x10_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x10_loop:
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
+ VBROADCASTSD 32(CX), Y8
+ VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
+ VBROADCASTSD 40(CX), Y9
+ VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
+ VBROADCASTSD 48(CX), Y10
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
+ VBROADCASTSD 56(CX), Y11
+ VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
+ VBROADCASTSD 64(CX), Y12
+ VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
+ VBROADCASTSD 72(CX), Y13
+ VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 10 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 640(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 648(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 656(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 664(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 672(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 680(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 688(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 696(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 704(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 712(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 720(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 728(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 736(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 744(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 752(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 760(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 768(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 776(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 784(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 792(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y4, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 216(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x10_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x10_end:
+ RET
+
+// func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_10x10_64Xor(SB), $8-88
+ // Loading 20 of 100 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 112 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x06, AX
+ TESTQ AX, AX
+ JZ mulGFNI_10x10_64Xor_end
+ VBROADCASTF32X2 (CX), Z0
+ VBROADCASTF32X2 8(CX), Z1
+ VBROADCASTF32X2 16(CX), Z2
+ VBROADCASTF32X2 24(CX), Z3
+ VBROADCASTF32X2 32(CX), Z4
+ VBROADCASTF32X2 40(CX), Z5
+ VBROADCASTF32X2 48(CX), Z6
+ VBROADCASTF32X2 56(CX), Z7
+ VBROADCASTF32X2 64(CX), Z8
+ VBROADCASTF32X2 72(CX), Z9
+ VBROADCASTF32X2 80(CX), Z10
+ VBROADCASTF32X2 88(CX), Z11
+ VBROADCASTF32X2 96(CX), Z12
+ VBROADCASTF32X2 104(CX), Z13
+ VBROADCASTF32X2 112(CX), Z14
+ VBROADCASTF32X2 120(CX), Z15
+ VBROADCASTF32X2 128(CX), Z16
+ VBROADCASTF32X2 136(CX), Z17
+ VBROADCASTF32X2 144(CX), Z18
+ VBROADCASTF32X2 152(CX), Z19
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulGFNI_10x10_64Xor_loop:
+ // Load 10 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 (BP)(R15*1), Z20
+ MOVQ 24(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z21
+ MOVQ 48(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z22
+ MOVQ 72(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z23
+ MOVQ 96(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z24
+ MOVQ 120(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z25
+ MOVQ 144(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z26
+ MOVQ 168(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z27
+ MOVQ 192(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z28
+ MOVQ 216(R14), BP
+ VMOVDQU64 (BP)(R15*1), Z29
+
+ // Load and process 64 bytes from input 0 to 10 outputs
+ VMOVDQU64 (BX), Z30
+ ADDQ $0x40, BX
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 1 to 10 outputs
+ VMOVDQU64 (SI), Z30
+ ADDQ $0x40, SI
+ VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 2 to 10 outputs
+ VMOVDQU64 (DI), Z30
+ ADDQ $0x40, DI
+ VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 3 to 10 outputs
+ VMOVDQU64 (R8), Z30
+ ADDQ $0x40, R8
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 4 to 10 outputs
+ VMOVDQU64 (R9), Z30
+ ADDQ $0x40, R9
+ VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 5 to 10 outputs
+ VMOVDQU64 (R10), Z30
+ ADDQ $0x40, R10
+ VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 6 to 10 outputs
+ VMOVDQU64 (R11), Z30
+ ADDQ $0x40, R11
+ VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 7 to 10 outputs
+ VMOVDQU64 (R12), Z30
+ ADDQ $0x40, R12
+ VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 8 to 10 outputs
+ VMOVDQU64 (R13), Z30
+ ADDQ $0x40, R13
+ VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Load and process 64 bytes from input 9 to 10 outputs
+ VMOVDQU64 (DX), Z30
+ ADDQ $0x40, DX
+ VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31
+ VXORPD Z20, Z31, Z20
+ VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31
+ VXORPD Z21, Z31, Z21
+ VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31
+ VXORPD Z22, Z31, Z22
+ VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31
+ VXORPD Z23, Z31, Z23
+ VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31
+ VXORPD Z24, Z31, Z24
+ VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31
+ VXORPD Z25, Z31, Z25
+ VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31
+ VXORPD Z26, Z31, Z26
+ VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31
+ VXORPD Z27, Z31, Z27
+ VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31
+ VXORPD Z28, Z31, Z28
+ VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31
+ VXORPD Z29, Z31, Z29
+
+ // Store 10 outputs
+ MOVQ (R14), BP
+ VMOVDQU64 Z20, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU64 Z21, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU64 Z22, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU64 Z23, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU64 Z24, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU64 Z25, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU64 Z26, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU64 Z27, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU64 Z28, (BP)(R15*1)
+ MOVQ 216(R14), BP
+ VMOVDQU64 Z29, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x40, R15
+ DECQ AX
+ JNZ mulGFNI_10x10_64Xor_loop
+ VZEROUPPER
+
+mulGFNI_10x10_64Xor_end:
+ RET
+
+// func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_10x10Xor(SB), $8-88
+ // Loading 4 of 100 tables to registers
+ // Destination kept on stack
+ // Full registers estimated 112 YMM used
+ MOVQ n+80(FP), AX
+ MOVQ matrix_base+0(FP), CX
+ SHRQ $0x05, AX
+ TESTQ AX, AX
+ JZ mulAvxGFNI_10x10Xor_end
+ VBROADCASTSD (CX), Y0
+ VBROADCASTSD 8(CX), Y1
+ VBROADCASTSD 16(CX), Y2
+ VBROADCASTSD 24(CX), Y3
+ MOVQ in_base+24(FP), DX
+ MOVQ (DX), BX
+ MOVQ 24(DX), SI
+ MOVQ 48(DX), DI
+ MOVQ 72(DX), R8
+ MOVQ 96(DX), R9
+ MOVQ 120(DX), R10
+ MOVQ 144(DX), R11
+ MOVQ 168(DX), R12
+ MOVQ 192(DX), R13
+ MOVQ 216(DX), DX
+ MOVQ out_base+48(FP), R14
+ MOVQ out_base+48(FP), R14
+ MOVQ start+72(FP), R15
+
+ // Add start offset to input
+ ADDQ R15, BX
+ ADDQ R15, SI
+ ADDQ R15, DI
+ ADDQ R15, R8
+ ADDQ R15, R9
+ ADDQ R15, R10
+ ADDQ R15, R11
+ ADDQ R15, R12
+ ADDQ R15, R13
+ ADDQ R15, DX
+
+mulAvxGFNI_10x10Xor_loop:
+ // Load 10 outputs
+ MOVQ (R14), BP
+ VMOVDQU (BP)(R15*1), Y4
+ MOVQ 24(R14), BP
+ VMOVDQU (BP)(R15*1), Y5
+ MOVQ 48(R14), BP
+ VMOVDQU (BP)(R15*1), Y6
+ MOVQ 72(R14), BP
+ VMOVDQU (BP)(R15*1), Y7
+ MOVQ 96(R14), BP
+ VMOVDQU (BP)(R15*1), Y8
+ MOVQ 120(R14), BP
+ VMOVDQU (BP)(R15*1), Y9
+ MOVQ 144(R14), BP
+ VMOVDQU (BP)(R15*1), Y10
+ MOVQ 168(R14), BP
+ VMOVDQU (BP)(R15*1), Y11
+ MOVQ 192(R14), BP
+ VMOVDQU (BP)(R15*1), Y12
+ MOVQ 216(R14), BP
+ VMOVDQU (BP)(R15*1), Y13
+
+ // Load and process 32 bytes from input 0 to 10 outputs
+ VMOVDQU (BX), Y14
+ ADDQ $0x20, BX
+ VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 32(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 40(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 48(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 56(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 64(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 72(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 1 to 10 outputs
+ VMOVDQU (SI), Y14
+ ADDQ $0x20, SI
+ VBROADCASTSD 80(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 88(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 96(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 104(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 112(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 120(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 128(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 136(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 144(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 152(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 2 to 10 outputs
+ VMOVDQU (DI), Y14
+ ADDQ $0x20, DI
+ VBROADCASTSD 160(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 168(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 176(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 184(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 192(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 200(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 208(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 216(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 224(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 232(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 3 to 10 outputs
+ VMOVDQU (R8), Y14
+ ADDQ $0x20, R8
+ VBROADCASTSD 240(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 248(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 256(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 264(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 272(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 280(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 288(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 296(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 304(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 312(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 4 to 10 outputs
+ VMOVDQU (R9), Y14
+ ADDQ $0x20, R9
+ VBROADCASTSD 320(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 328(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 336(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 344(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 352(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 360(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 368(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 376(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 384(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 392(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 5 to 10 outputs
+ VMOVDQU (R10), Y14
+ ADDQ $0x20, R10
+ VBROADCASTSD 400(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 408(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 416(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 424(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 432(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 440(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 448(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 456(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 464(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 472(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 6 to 10 outputs
+ VMOVDQU (R11), Y14
+ ADDQ $0x20, R11
+ VBROADCASTSD 480(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 488(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 496(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 504(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 512(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 520(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 528(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 536(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 544(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 552(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 7 to 10 outputs
+ VMOVDQU (R12), Y14
+ ADDQ $0x20, R12
+ VBROADCASTSD 560(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 568(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 576(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 584(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 592(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 600(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 608(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 616(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 624(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 632(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 8 to 10 outputs
+ VMOVDQU (R13), Y14
+ ADDQ $0x20, R13
+ VBROADCASTSD 640(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 648(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 656(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 664(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 672(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 680(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 688(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 696(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 704(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 712(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Load and process 32 bytes from input 9 to 10 outputs
+ VMOVDQU (DX), Y14
+ ADDQ $0x20, DX
+ VBROADCASTSD 720(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y4, Y15, Y4
+ VBROADCASTSD 728(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y5, Y15, Y5
+ VBROADCASTSD 736(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y6, Y15, Y6
+ VBROADCASTSD 744(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y7, Y15, Y7
+ VBROADCASTSD 752(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y8, Y15, Y8
+ VBROADCASTSD 760(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y9, Y15, Y9
+ VBROADCASTSD 768(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y10, Y15, Y10
+ VBROADCASTSD 776(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y11, Y15, Y11
+ VBROADCASTSD 784(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y12, Y15, Y12
+ VBROADCASTSD 792(CX), Y15
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+ VXORPD Y13, Y15, Y13
+
+ // Store 10 outputs
+ MOVQ (R14), BP
+ VMOVDQU Y4, (BP)(R15*1)
+ MOVQ 24(R14), BP
+ VMOVDQU Y5, (BP)(R15*1)
+ MOVQ 48(R14), BP
+ VMOVDQU Y6, (BP)(R15*1)
+ MOVQ 72(R14), BP
+ VMOVDQU Y7, (BP)(R15*1)
+ MOVQ 96(R14), BP
+ VMOVDQU Y8, (BP)(R15*1)
+ MOVQ 120(R14), BP
+ VMOVDQU Y9, (BP)(R15*1)
+ MOVQ 144(R14), BP
+ VMOVDQU Y10, (BP)(R15*1)
+ MOVQ 168(R14), BP
+ VMOVDQU Y11, (BP)(R15*1)
+ MOVQ 192(R14), BP
+ VMOVDQU Y12, (BP)(R15*1)
+ MOVQ 216(R14), BP
+ VMOVDQU Y13, (BP)(R15*1)
+
+ // Prepare for next loop
+ ADDQ $0x20, R15
+ DECQ AX
+ JNZ mulAvxGFNI_10x10Xor_loop
+ VZEROUPPER
+
+mulAvxGFNI_10x10Xor_end:
+ RET
+
+// func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·ifftDIT48_gfni_0(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t01+32(FP), Z0
+ VBROADCASTF32X2 t23+40(FP), Z1
+ VBROADCASTF32X2 t02+48(FP), Z2
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z3
+ VMOVDQU64 (DI), Z4
+ VMOVDQU64 (R8), Z5
+ VMOVDQU64 (AX), Z6
+ VXORPD Z4, Z3, Z4
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z7
+ VXORPD Z3, Z7, Z3
+ VXORPD Z5, Z6, Z6
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
+ VPTERNLOGD $0x96, Z7, Z3, Z5
+ VXORPD Z4, Z6, Z6
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z2, Z5, Z7
+ VXORPD Z3, Z7, Z3
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
+ VXORPD Z4, Z7, Z4
+ VMOVDQU64 Z3, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z4, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z5, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z6, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·fftDIT48_gfni_0(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t01+32(FP), Z0
+ VBROADCASTF32X2 t23+40(FP), Z1
+ VBROADCASTF32X2 t02+48(FP), Z2
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z3
+ VMOVDQU64 (DI), Z4
+ VMOVDQU64 (R8), Z5
+ VMOVDQU64 (AX), Z6
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z2, Z5, Z7
+ VXORPD Z3, Z7, Z3
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
+ VXORPD Z4, Z7, Z4
+ VXORPD Z3, Z5, Z5
+ VXORPD Z4, Z6, Z6
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z7
+ VXORPD Z3, Z7, Z3
+ VXORPD Z4, Z3, Z4
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
+ VXORPD Z5, Z7, Z5
+ VXORPD Z5, Z6, Z6
+ VMOVDQU64 Z3, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z4, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z5, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z6, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·ifftDIT48_gfni_1(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t23+40(FP), Z0
+ VBROADCASTF32X2 t02+48(FP), Z1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z2
+ VMOVDQU64 (DI), Z3
+ VMOVDQU64 (R8), Z4
+ VMOVDQU64 (AX), Z5
+ VXORPD Z3, Z2, Z3
+ VXORPD Z4, Z5, Z5
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z5, Z6
+ VPTERNLOGD $0x96, Z6, Z2, Z4
+ VXORPD Z3, Z5, Z5
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z4, Z6
+ VXORPD Z2, Z6, Z2
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
+ VXORPD Z3, Z6, Z3
+ VMOVDQU64 Z2, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z3, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z4, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z5, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·fftDIT48_gfni_1(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t01+32(FP), Z0
+ VBROADCASTF32X2 t23+40(FP), Z1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z2
+ VMOVDQU64 (DI), Z3
+ VMOVDQU64 (R8), Z4
+ VMOVDQU64 (AX), Z5
+ VXORPD Z2, Z4, Z4
+ VXORPD Z3, Z5, Z5
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z6
+ VXORPD Z2, Z6, Z2
+ VXORPD Z3, Z2, Z3
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
+ VXORPD Z4, Z6, Z4
+ VXORPD Z4, Z5, Z5
+ VMOVDQU64 Z2, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z3, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z4, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z5, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·ifftDIT48_gfni_2(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t01+32(FP), Z0
+ VBROADCASTF32X2 t02+48(FP), Z1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z2
+ VMOVDQU64 (DI), Z3
+ VMOVDQU64 (R8), Z4
+ VMOVDQU64 (AX), Z5
+ VXORPD Z3, Z2, Z3
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z6
+ VXORPD Z2, Z6, Z2
+ VXORPD Z4, Z5, Z5
+ VXORPD Z2, Z4, Z4
+ VXORPD Z3, Z5, Z5
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z4, Z6
+ VXORPD Z2, Z6, Z2
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
+ VXORPD Z3, Z6, Z3
+ VMOVDQU64 Z2, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z3, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z4, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z5, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·fftDIT48_gfni_2(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t23+40(FP), Z0
+ VBROADCASTF32X2 t02+48(FP), Z1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z2
+ VMOVDQU64 (DI), Z3
+ VMOVDQU64 (R8), Z4
+ VMOVDQU64 (AX), Z5
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z4, Z6
+ VXORPD Z2, Z6, Z2
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
+ VXORPD Z3, Z6, Z3
+ VXORPD Z2, Z4, Z4
+ VXORPD Z3, Z5, Z5
+ VXORPD Z3, Z2, Z3
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z5, Z6
+ VXORPD Z4, Z6, Z4
+ VXORPD Z4, Z5, Z5
+ VMOVDQU64 Z2, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z3, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z4, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z5, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·ifftDIT48_gfni_3(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t02+48(FP), Z0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z1
+ VMOVDQU64 (DI), Z2
+ VMOVDQU64 (R8), Z3
+ VMOVDQU64 (AX), Z4
+ VXORPD Z2, Z1, Z2
+ VXORPD Z3, Z4, Z4
+ VXORPD Z1, Z3, Z3
+ VXORPD Z2, Z4, Z4
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z5
+ VXORPD Z1, Z5, Z1
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
+ VXORPD Z2, Z5, Z2
+ VMOVDQU64 Z1, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z2, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z3, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z4, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·fftDIT48_gfni_3(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t23+40(FP), Z0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z1
+ VMOVDQU64 (DI), Z2
+ VMOVDQU64 (R8), Z3
+ VMOVDQU64 (AX), Z4
+ VXORPD Z1, Z3, Z3
+ VXORPD Z2, Z4, Z4
+ VXORPD Z2, Z1, Z2
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
+ VXORPD Z3, Z5, Z3
+ VXORPD Z3, Z4, Z4
+ VMOVDQU64 Z1, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z2, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z3, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z4, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·ifftDIT48_gfni_4(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t01+32(FP), Z0
+ VBROADCASTF32X2 t23+40(FP), Z1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z2
+ VMOVDQU64 (DI), Z3
+ VMOVDQU64 (R8), Z4
+ VMOVDQU64 (AX), Z5
+ VXORPD Z3, Z2, Z3
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z6
+ VXORPD Z2, Z6, Z2
+ VXORPD Z4, Z5, Z5
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
+ VPTERNLOGD $0x96, Z6, Z2, Z4
+ VXORPD Z3, Z5, Z5
+ VMOVDQU64 Z2, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z3, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z4, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z5, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·fftDIT48_gfni_4(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t01+32(FP), Z0
+ VBROADCASTF32X2 t02+48(FP), Z1
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z2
+ VMOVDQU64 (DI), Z3
+ VMOVDQU64 (R8), Z4
+ VMOVDQU64 (AX), Z5
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z4, Z6
+ VXORPD Z2, Z6, Z2
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
+ VXORPD Z3, Z6, Z3
+ VXORPD Z2, Z4, Z4
+ VXORPD Z3, Z5, Z5
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z6
+ VXORPD Z2, Z6, Z2
+ VXORPD Z3, Z2, Z3
+ VXORPD Z4, Z5, Z5
+ VMOVDQU64 Z2, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z3, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z4, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z5, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·ifftDIT48_gfni_5(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t23+40(FP), Z0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z1
+ VMOVDQU64 (DI), Z2
+ VMOVDQU64 (R8), Z3
+ VMOVDQU64 (AX), Z4
+ VXORPD Z2, Z1, Z2
+ VXORPD Z3, Z4, Z4
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
+ VPTERNLOGD $0x96, Z5, Z1, Z3
+ VXORPD Z2, Z4, Z4
+ VMOVDQU64 Z1, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z2, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z3, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z4, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·fftDIT48_gfni_5(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t01+32(FP), Z0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z1
+ VMOVDQU64 (DI), Z2
+ VMOVDQU64 (R8), Z3
+ VMOVDQU64 (AX), Z4
+ VXORPD Z1, Z3, Z3
+ VXORPD Z2, Z4, Z4
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z2, Z5
+ VXORPD Z1, Z5, Z1
+ VXORPD Z2, Z1, Z2
+ VXORPD Z3, Z4, Z4
+ VMOVDQU64 Z1, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z2, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z3, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z4, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·ifftDIT48_gfni_6(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t01+32(FP), Z0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z1
+ VMOVDQU64 (DI), Z2
+ VMOVDQU64 (R8), Z3
+ VMOVDQU64 (AX), Z4
+ VXORPD Z2, Z1, Z2
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z2, Z5
+ VXORPD Z1, Z5, Z1
+ VXORPD Z3, Z4, Z4
+ VXORPD Z1, Z3, Z3
+ VXORPD Z2, Z4, Z4
+ VMOVDQU64 Z1, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z2, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z3, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z4, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·fftDIT48_gfni_6(SB), NOSPLIT, $0-56
+ VBROADCASTF32X2 t02+48(FP), Z0
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z1
+ VMOVDQU64 (DI), Z2
+ VMOVDQU64 (R8), Z3
+ VMOVDQU64 (AX), Z4
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z3, Z5
+ VXORPD Z1, Z5, Z1
+
+ // LEO_MULADD_512
+ VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
+ VXORPD Z2, Z5, Z2
+ VXORPD Z1, Z3, Z3
+ VXORPD Z2, Z4, Z4
+ VXORPD Z2, Z1, Z2
+ VXORPD Z3, Z4, Z4
+ VMOVDQU64 Z1, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z2, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z3, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z4, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func ifftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F
+TEXT ·ifftDIT48_gfni_7(SB), NOSPLIT, $0-56
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z0
+ VMOVDQU64 (DI), Z1
+ VMOVDQU64 (R8), Z2
+ VMOVDQU64 (AX), Z3
+ VXORPD Z1, Z0, Z1
+ VXORPD Z2, Z3, Z3
+ VXORPD Z0, Z2, Z2
+ VXORPD Z1, Z3, Z3
+ VMOVDQU64 Z0, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z1, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z2, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z3, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
+
+// func fftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
+// Requires: AVX, AVX512DQ, AVX512F
+TEXT ·fftDIT48_gfni_7(SB), NOSPLIT, $0-56
+ MOVQ dist+24(FP), AX
+ MOVQ work_base+0(FP), CX
+ MOVQ 8(CX), DX
+ XORQ BX, BX
+ MOVQ (CX)(BX*1), SI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), DI
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), R8
+ ADDQ AX, BX
+ MOVQ (CX)(BX*1), AX
+
+loop:
+ VMOVDQU64 (SI), Z0
+ VMOVDQU64 (DI), Z1
+ VMOVDQU64 (R8), Z2
+ VMOVDQU64 (AX), Z3
+ VXORPD Z0, Z2, Z2
+ VXORPD Z1, Z3, Z3
+ VXORPD Z1, Z0, Z1
+ VXORPD Z2, Z3, Z3
+ VMOVDQU64 Z0, (SI)
+ ADDQ $0x40, SI
+ VMOVDQU64 Z1, (DI)
+ ADDQ $0x40, DI
+ VMOVDQU64 Z2, (R8)
+ ADDQ $0x40, R8
+ VMOVDQU64 Z3, (AX)
+ ADDQ $0x40, AX
+ SUBQ $0x40, DX
+ JA loop
+ VZEROUPPER
+ RET
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go
new file mode 100644
index 000000000..429e2c20d
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go
@@ -0,0 +1,2045 @@
+// Code generated by command: go generate gen.go. DO NOT EDIT.
+
+//go:build !appengine && !noasm && gc && !nogen && !nopshufb
+// +build !appengine,!noasm,gc,!nogen,!nopshufb
+
+package reedsolomon
+
+import (
+ "fmt"
+)
+
+const (
+ avx2CodeGen = true
+ maxAvx2Inputs = 10
+ maxAvx2Outputs = 10
+ minAvx2Size = 64
+)
+
+func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
+ n := stop - start
+
+ switch len(in) {
+ case 1:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_1x1_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_1x2_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_1x3_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_1x4(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_1x5(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_1x6(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_1x7(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_1x8(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_1x9(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_1x10(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 2:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_2x1_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_2x2_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_2x3_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_2x4(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_2x5(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_2x6(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_2x7(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_2x8(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_2x9(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_2x10(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 3:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_3x1_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_3x2_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_3x3_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_3x4(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_3x5(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_3x6(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_3x7(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_3x8(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_3x9(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_3x10(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 4:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_4x1_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_4x2_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_4x3_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_4x4(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_4x5(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_4x6(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_4x7(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_4x8(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_4x9(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_4x10(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 5:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_5x1_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_5x2_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_5x3_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_5x4(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_5x5(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_5x6(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_5x7(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_5x8(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_5x9(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_5x10(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 6:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_6x1_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_6x2_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_6x3_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_6x4(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_6x5(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_6x6(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_6x7(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_6x8(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_6x9(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_6x10(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 7:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_7x1_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_7x2_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_7x3_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_7x4(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_7x5(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_7x6(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_7x7(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_7x8(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_7x9(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_7x10(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 8:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_8x1_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_8x2_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_8x3_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_8x4(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_8x5(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_8x6(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_8x7(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_8x8(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_8x9(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_8x10(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 9:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_9x1_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_9x2_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_9x3_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_9x4(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_9x5(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_9x6(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_9x7(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_9x8(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_9x9(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_9x10(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 10:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_10x1_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_10x2_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_10x3_64(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_10x4(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_10x5(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_10x6(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_10x7(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_10x8(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_10x9(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_10x10(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ }
+ panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
+}
+
+func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int {
+ n := (stop - start)
+
+ switch len(in) {
+ case 1:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_1x1_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_1x2_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_1x3_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_1x4Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_1x5Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_1x6Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_1x7Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_1x8Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_1x9Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_1x10Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 2:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_2x1_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_2x2_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_2x3_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_2x4Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_2x5Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_2x6Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_2x7Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_2x8Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_2x9Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_2x10Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 3:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_3x1_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_3x2_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_3x3_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_3x4Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_3x5Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_3x6Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_3x7Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_3x8Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_3x9Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_3x10Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 4:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_4x1_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_4x2_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_4x3_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_4x4Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_4x5Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_4x6Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_4x7Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_4x8Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_4x9Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_4x10Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 5:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_5x1_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_5x2_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_5x3_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_5x4Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_5x5Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_5x6Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_5x7Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_5x8Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_5x9Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_5x10Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 6:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_6x1_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_6x2_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_6x3_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_6x4Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_6x5Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_6x6Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_6x7Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_6x8Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_6x9Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_6x10Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 7:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_7x1_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_7x2_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_7x3_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_7x4Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_7x5Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_7x6Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_7x7Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_7x8Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_7x9Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_7x10Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 8:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_8x1_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_8x2_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_8x3_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_8x4Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_8x5Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_8x6Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_8x7Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_8x8Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_8x9Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_8x10Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 9:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_9x1_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_9x2_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_9x3_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_9x4Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_9x5Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_9x6Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_9x7Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_9x8Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_9x9Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_9x10Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ case 10:
+ switch len(out) {
+ case 1:
+ mulAvxTwo_10x1_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 2:
+ mulAvxTwo_10x2_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 3:
+ mulAvxTwo_10x3_64Xor(matrix, in, out, start, n)
+ return n & (maxInt - 63)
+ case 4:
+ mulAvxTwo_10x4Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 5:
+ mulAvxTwo_10x5Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 6:
+ mulAvxTwo_10x6Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 7:
+ mulAvxTwo_10x7Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 8:
+ mulAvxTwo_10x8Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 9:
+ mulAvxTwo_10x9Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ case 10:
+ mulAvxTwo_10x10Xor(matrix, in, out, start, n)
+ return n & (maxInt - 31)
+ }
+ }
+ panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
+}
+
+func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
+ n := (stop - start) & (maxInt - (64 - 1))
+
+ switch len(in) {
+ case 1:
+ switch len(out) {
+ case 1:
+ mulGFNI_1x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_1x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_1x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_1x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_1x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_1x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_1x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_1x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_1x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_1x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 2:
+ switch len(out) {
+ case 1:
+ mulGFNI_2x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_2x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_2x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_2x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_2x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_2x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_2x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_2x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_2x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_2x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 3:
+ switch len(out) {
+ case 1:
+ mulGFNI_3x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_3x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_3x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_3x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_3x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_3x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_3x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_3x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_3x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_3x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 4:
+ switch len(out) {
+ case 1:
+ mulGFNI_4x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_4x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_4x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_4x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_4x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_4x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_4x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_4x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_4x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_4x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 5:
+ switch len(out) {
+ case 1:
+ mulGFNI_5x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_5x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_5x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_5x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_5x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_5x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_5x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_5x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_5x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_5x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 6:
+ switch len(out) {
+ case 1:
+ mulGFNI_6x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_6x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_6x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_6x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_6x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_6x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_6x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_6x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_6x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_6x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 7:
+ switch len(out) {
+ case 1:
+ mulGFNI_7x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_7x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_7x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_7x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_7x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_7x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_7x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_7x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_7x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_7x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 8:
+ switch len(out) {
+ case 1:
+ mulGFNI_8x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_8x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_8x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_8x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_8x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_8x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_8x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_8x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_8x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_8x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 9:
+ switch len(out) {
+ case 1:
+ mulGFNI_9x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_9x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_9x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_9x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_9x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_9x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_9x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_9x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_9x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_9x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 10:
+ switch len(out) {
+ case 1:
+ mulGFNI_10x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_10x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_10x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_10x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_10x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_10x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_10x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_10x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_10x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_10x10_64(matrix, in, out, start, n)
+ return n
+ }
+ }
+ panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
+}
+
+func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int {
+ n := (stop - start) & (maxInt - (64 - 1))
+
+ switch len(in) {
+ case 1:
+ switch len(out) {
+ case 1:
+ mulGFNI_1x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_1x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_1x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_1x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_1x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_1x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_1x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_1x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_1x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_1x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 2:
+ switch len(out) {
+ case 1:
+ mulGFNI_2x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_2x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_2x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_2x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_2x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_2x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_2x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_2x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_2x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_2x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 3:
+ switch len(out) {
+ case 1:
+ mulGFNI_3x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_3x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_3x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_3x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_3x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_3x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_3x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_3x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_3x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_3x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 4:
+ switch len(out) {
+ case 1:
+ mulGFNI_4x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_4x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_4x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_4x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_4x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_4x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_4x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_4x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_4x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_4x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 5:
+ switch len(out) {
+ case 1:
+ mulGFNI_5x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_5x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_5x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_5x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_5x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_5x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_5x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_5x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_5x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_5x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 6:
+ switch len(out) {
+ case 1:
+ mulGFNI_6x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_6x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_6x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_6x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_6x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_6x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_6x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_6x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_6x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_6x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 7:
+ switch len(out) {
+ case 1:
+ mulGFNI_7x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_7x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_7x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_7x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_7x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_7x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_7x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_7x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_7x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_7x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 8:
+ switch len(out) {
+ case 1:
+ mulGFNI_8x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_8x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_8x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_8x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_8x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_8x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_8x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_8x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_8x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_8x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 9:
+ switch len(out) {
+ case 1:
+ mulGFNI_9x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_9x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_9x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_9x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_9x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_9x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_9x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_9x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_9x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_9x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 10:
+ switch len(out) {
+ case 1:
+ mulGFNI_10x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_10x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_10x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_10x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_10x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_10x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_10x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_10x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_10x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_10x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ }
+ panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
+}
+
+func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
+ n := (stop - start) & (maxInt - (32 - 1))
+
+ switch len(in) {
+ case 1:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_1x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_1x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_1x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_1x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_1x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_1x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_1x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_1x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_1x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_1x10(matrix, in, out, start, n)
+ return n
+ }
+ case 2:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_2x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_2x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_2x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_2x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_2x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_2x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_2x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_2x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_2x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_2x10(matrix, in, out, start, n)
+ return n
+ }
+ case 3:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_3x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_3x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_3x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_3x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_3x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_3x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_3x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_3x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_3x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_3x10(matrix, in, out, start, n)
+ return n
+ }
+ case 4:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_4x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_4x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_4x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_4x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_4x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_4x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_4x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_4x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_4x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_4x10(matrix, in, out, start, n)
+ return n
+ }
+ case 5:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_5x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_5x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_5x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_5x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_5x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_5x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_5x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_5x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_5x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_5x10(matrix, in, out, start, n)
+ return n
+ }
+ case 6:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_6x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_6x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_6x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_6x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_6x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_6x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_6x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_6x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_6x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_6x10(matrix, in, out, start, n)
+ return n
+ }
+ case 7:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_7x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_7x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_7x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_7x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_7x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_7x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_7x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_7x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_7x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_7x10(matrix, in, out, start, n)
+ return n
+ }
+ case 8:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_8x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_8x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_8x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_8x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_8x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_8x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_8x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_8x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_8x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_8x10(matrix, in, out, start, n)
+ return n
+ }
+ case 9:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_9x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_9x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_9x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_9x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_9x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_9x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_9x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_9x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_9x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_9x10(matrix, in, out, start, n)
+ return n
+ }
+ case 10:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_10x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_10x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_10x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_10x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_10x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_10x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_10x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_10x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_10x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_10x10(matrix, in, out, start, n)
+ return n
+ }
+ }
+ panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
+}
+
+func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int {
+ n := (stop - start) & (maxInt - (32 - 1))
+
+ switch len(in) {
+ case 1:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_1x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_1x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_1x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_1x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_1x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_1x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_1x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_1x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_1x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_1x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 2:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_2x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_2x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_2x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_2x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_2x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_2x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_2x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_2x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_2x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_2x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 3:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_3x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_3x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_3x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_3x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_3x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_3x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_3x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_3x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_3x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_3x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 4:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_4x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_4x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_4x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_4x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_4x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_4x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_4x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_4x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_4x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_4x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 5:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_5x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_5x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_5x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_5x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_5x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_5x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_5x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_5x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_5x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_5x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 6:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_6x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_6x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_6x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_6x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_6x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_6x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_6x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_6x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_6x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_6x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 7:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_7x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_7x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_7x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_7x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_7x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_7x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_7x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_7x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_7x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_7x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 8:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_8x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_8x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_8x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_8x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_8x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_8x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_8x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_8x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_8x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_8x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 9:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_9x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_9x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_9x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_9x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_9x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_9x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_9x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_9x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_9x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_9x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 10:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_10x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_10x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_10x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_10x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_10x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_10x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_10x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_10x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_10x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_10x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ }
+ panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
+}
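Note on the generated dispatchers above: each one first rounds the requested byte range down to the kernel's block width (32 bytes for the mulAvxGFNI_* kernels, 64 for the mulGFNI_*_64 kernels added further down) via the mask `(stop - start) & (maxInt - (blockSize - 1))`, and returns how many bytes it actually processed so the caller can finish any remaining tail by other means. A small self-contained sketch of that arithmetic; the roundDown helper and the numbers are illustrative only, not part of the package:

    package main

    import "fmt"

    const maxInt = int(^uint(0) >> 1)

    // roundDown mirrors the masking used by the generated dispatchers:
    // for a power-of-two blockSize it returns the largest multiple of
    // blockSize that is <= length.
    func roundDown(length, blockSize int) int {
        return length & (maxInt - (blockSize - 1))
    }

    func main() {
        start, stop := 0, 1000
        n := roundDown(stop-start, 32) // 992 bytes handled by the 32-byte kernel
        fmt.Println(n, stop-start-n)   // remaining 8 bytes left for a generic fallback
    }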
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go
new file mode 100644
index 000000000..1ba08b5e1
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go
@@ -0,0 +1,1372 @@
+// Code generated by command: go generate gen.go. DO NOT EDIT.
+
+//go:build !appengine && !noasm && gc && !nogen && nopshufb
+// +build !appengine,!noasm,gc,!nogen,nopshufb
+
+package reedsolomon
+
+import (
+ "fmt"
+)
+
+const (
+ avx2CodeGen = true
+ maxAvx2Inputs = 10
+ maxAvx2Outputs = 10
+ minAvx2Size = 64
+)
+
+func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`) }
+func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`) }
+
+func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
+ n := (stop - start) & (maxInt - (64 - 1))
+
+ switch len(in) {
+ case 1:
+ switch len(out) {
+ case 1:
+ mulGFNI_1x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_1x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_1x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_1x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_1x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_1x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_1x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_1x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_1x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_1x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 2:
+ switch len(out) {
+ case 1:
+ mulGFNI_2x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_2x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_2x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_2x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_2x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_2x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_2x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_2x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_2x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_2x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 3:
+ switch len(out) {
+ case 1:
+ mulGFNI_3x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_3x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_3x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_3x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_3x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_3x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_3x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_3x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_3x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_3x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 4:
+ switch len(out) {
+ case 1:
+ mulGFNI_4x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_4x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_4x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_4x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_4x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_4x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_4x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_4x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_4x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_4x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 5:
+ switch len(out) {
+ case 1:
+ mulGFNI_5x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_5x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_5x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_5x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_5x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_5x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_5x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_5x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_5x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_5x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 6:
+ switch len(out) {
+ case 1:
+ mulGFNI_6x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_6x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_6x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_6x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_6x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_6x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_6x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_6x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_6x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_6x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 7:
+ switch len(out) {
+ case 1:
+ mulGFNI_7x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_7x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_7x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_7x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_7x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_7x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_7x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_7x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_7x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_7x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 8:
+ switch len(out) {
+ case 1:
+ mulGFNI_8x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_8x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_8x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_8x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_8x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_8x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_8x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_8x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_8x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_8x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 9:
+ switch len(out) {
+ case 1:
+ mulGFNI_9x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_9x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_9x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_9x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_9x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_9x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_9x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_9x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_9x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_9x10_64(matrix, in, out, start, n)
+ return n
+ }
+ case 10:
+ switch len(out) {
+ case 1:
+ mulGFNI_10x1_64(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_10x2_64(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_10x3_64(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_10x4_64(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_10x5_64(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_10x6_64(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_10x7_64(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_10x8_64(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_10x9_64(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_10x10_64(matrix, in, out, start, n)
+ return n
+ }
+ }
+ panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
+}
+
+func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int {
+ n := (stop - start) & (maxInt - (64 - 1))
+
+ switch len(in) {
+ case 1:
+ switch len(out) {
+ case 1:
+ mulGFNI_1x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_1x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_1x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_1x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_1x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_1x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_1x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_1x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_1x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_1x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 2:
+ switch len(out) {
+ case 1:
+ mulGFNI_2x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_2x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_2x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_2x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_2x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_2x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_2x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_2x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_2x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_2x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 3:
+ switch len(out) {
+ case 1:
+ mulGFNI_3x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_3x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_3x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_3x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_3x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_3x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_3x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_3x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_3x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_3x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 4:
+ switch len(out) {
+ case 1:
+ mulGFNI_4x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_4x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_4x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_4x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_4x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_4x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_4x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_4x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_4x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_4x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 5:
+ switch len(out) {
+ case 1:
+ mulGFNI_5x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_5x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_5x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_5x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_5x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_5x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_5x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_5x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_5x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_5x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 6:
+ switch len(out) {
+ case 1:
+ mulGFNI_6x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_6x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_6x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_6x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_6x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_6x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_6x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_6x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_6x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_6x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 7:
+ switch len(out) {
+ case 1:
+ mulGFNI_7x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_7x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_7x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_7x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_7x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_7x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_7x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_7x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_7x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_7x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 8:
+ switch len(out) {
+ case 1:
+ mulGFNI_8x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_8x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_8x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_8x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_8x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_8x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_8x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_8x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_8x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_8x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 9:
+ switch len(out) {
+ case 1:
+ mulGFNI_9x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_9x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_9x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_9x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_9x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_9x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_9x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_9x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_9x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_9x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 10:
+ switch len(out) {
+ case 1:
+ mulGFNI_10x1_64Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulGFNI_10x2_64Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulGFNI_10x3_64Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulGFNI_10x4_64Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulGFNI_10x5_64Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulGFNI_10x6_64Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulGFNI_10x7_64Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulGFNI_10x8_64Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulGFNI_10x9_64Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulGFNI_10x10_64Xor(matrix, in, out, start, n)
+ return n
+ }
+ }
+ panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
+}
+
+func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
+ n := (stop - start) & (maxInt - (32 - 1))
+
+ switch len(in) {
+ case 1:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_1x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_1x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_1x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_1x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_1x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_1x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_1x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_1x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_1x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_1x10(matrix, in, out, start, n)
+ return n
+ }
+ case 2:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_2x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_2x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_2x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_2x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_2x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_2x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_2x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_2x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_2x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_2x10(matrix, in, out, start, n)
+ return n
+ }
+ case 3:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_3x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_3x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_3x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_3x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_3x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_3x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_3x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_3x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_3x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_3x10(matrix, in, out, start, n)
+ return n
+ }
+ case 4:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_4x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_4x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_4x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_4x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_4x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_4x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_4x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_4x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_4x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_4x10(matrix, in, out, start, n)
+ return n
+ }
+ case 5:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_5x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_5x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_5x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_5x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_5x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_5x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_5x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_5x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_5x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_5x10(matrix, in, out, start, n)
+ return n
+ }
+ case 6:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_6x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_6x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_6x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_6x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_6x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_6x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_6x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_6x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_6x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_6x10(matrix, in, out, start, n)
+ return n
+ }
+ case 7:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_7x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_7x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_7x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_7x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_7x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_7x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_7x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_7x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_7x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_7x10(matrix, in, out, start, n)
+ return n
+ }
+ case 8:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_8x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_8x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_8x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_8x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_8x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_8x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_8x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_8x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_8x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_8x10(matrix, in, out, start, n)
+ return n
+ }
+ case 9:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_9x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_9x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_9x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_9x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_9x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_9x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_9x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_9x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_9x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_9x10(matrix, in, out, start, n)
+ return n
+ }
+ case 10:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_10x1(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_10x2(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_10x3(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_10x4(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_10x5(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_10x6(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_10x7(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_10x8(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_10x9(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_10x10(matrix, in, out, start, n)
+ return n
+ }
+ }
+ panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
+}
+
+func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int {
+ n := (stop - start) & (maxInt - (32 - 1))
+
+ switch len(in) {
+ case 1:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_1x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_1x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_1x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_1x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_1x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_1x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_1x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_1x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_1x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_1x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 2:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_2x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_2x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_2x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_2x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_2x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_2x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_2x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_2x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_2x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_2x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 3:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_3x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_3x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_3x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_3x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_3x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_3x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_3x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_3x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_3x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_3x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 4:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_4x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_4x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_4x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_4x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_4x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_4x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_4x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_4x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_4x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_4x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 5:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_5x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_5x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_5x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_5x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_5x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_5x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_5x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_5x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_5x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_5x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 6:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_6x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_6x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_6x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_6x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_6x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_6x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_6x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_6x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_6x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_6x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 7:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_7x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_7x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_7x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_7x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_7x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_7x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_7x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_7x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_7x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_7x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 8:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_8x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_8x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_8x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_8x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_8x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_8x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_8x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_8x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_8x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_8x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 9:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_9x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_9x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_9x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_9x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_9x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_9x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_9x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_9x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_9x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_9x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ case 10:
+ switch len(out) {
+ case 1:
+ mulAvxGFNI_10x1Xor(matrix, in, out, start, n)
+ return n
+ case 2:
+ mulAvxGFNI_10x2Xor(matrix, in, out, start, n)
+ return n
+ case 3:
+ mulAvxGFNI_10x3Xor(matrix, in, out, start, n)
+ return n
+ case 4:
+ mulAvxGFNI_10x4Xor(matrix, in, out, start, n)
+ return n
+ case 5:
+ mulAvxGFNI_10x5Xor(matrix, in, out, start, n)
+ return n
+ case 6:
+ mulAvxGFNI_10x6Xor(matrix, in, out, start, n)
+ return n
+ case 7:
+ mulAvxGFNI_10x7Xor(matrix, in, out, start, n)
+ return n
+ case 8:
+ mulAvxGFNI_10x8Xor(matrix, in, out, start, n)
+ return n
+ case 9:
+ mulAvxGFNI_10x9Xor(matrix, in, out, start, n)
+ return n
+ case 10:
+ mulAvxGFNI_10x10Xor(matrix, in, out, start, n)
+ return n
+ }
+ }
+ panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
+}
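Note: this file mirrors the non-nopshufb switch file above, but is compiled only when the nopshufb tag is set (see its //go:build line). The PSHUFB-based AVX2 dispatchers are reduced to panicking stubs, leaving only the GFNI and AVX+GFNI kernels. As a usage note, a consumer of the module would opt into this variant at build time with the standard tag mechanism, e.g.

    go build -tags nopshufb ./...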
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_noasm.go b/vendor/github.com/klauspost/reedsolomon/galois_noasm.go
new file mode 100644
index 000000000..fb5a3b654
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois_noasm.go
@@ -0,0 +1,91 @@
+//go:build (!amd64 || noasm || appengine || gccgo) && (!arm64 || noasm || appengine || gccgo || nopshufb) && (!ppc64le || noasm || appengine || gccgo || nopshufb)
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+
+package reedsolomon
+
+const pshufb = false
+
+func galMulSlice(c byte, in, out []byte, o *options) {
+ out = out[:len(in)]
+ if c == 1 {
+ copy(out, in)
+ return
+ }
+ mt := mulTable[c][:256]
+ for n, input := range in {
+ out[n] = mt[input]
+ }
+}
+
+func galMulSliceXor(c byte, in, out []byte, o *options) {
+ out = out[:len(in)]
+ if c == 1 {
+ sliceXor(in, out, o)
+ return
+ }
+ mt := mulTable[c][:256]
+ for n, input := range in {
+ out[n] ^= mt[input]
+ }
+}
+
+func init() {
+ defaultOptions.useAVX512 = false
+}
+
+// 4-way butterfly
+func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+ ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
+ ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+ fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
+ fftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 2-way butterfly forward
+func fftDIT2(x, y []byte, log_m ffe, o *options) {
+ // Reference version:
+ refMulAdd(x, y, log_m)
+ sliceXorGo(x, y, o)
+}
+
+// 2-way butterfly forward
+func fftDIT28(x, y []byte, log_m ffe8, o *options) {
+ // Reference version:
+ refMulAdd8(x, y, log_m)
+ sliceXorGo(x, y, o)
+}
+
+// 2-way butterfly inverse
+func ifftDIT2(x, y []byte, log_m ffe, o *options) {
+ // Reference version:
+ sliceXorGo(x, y, o)
+ refMulAdd(x, y, log_m)
+}
+
+// 2-way butterfly inverse
+func ifftDIT28(x, y []byte, log_m ffe8, o *options) {
+ // Reference version:
+ sliceXorGo(x, y, o)
+ refMulAdd8(x, y, log_m)
+}
+
+func mulgf16(x, y []byte, log_m ffe, o *options) {
+ refMul(x, y, log_m)
+}
+
+func mulgf8(x, y []byte, log_m ffe8, o *options) {
+ refMul8(x, y, log_m)
+}
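The loops in this pure-Go fallback reduce every per-byte operation to a mulTable lookup. For reference, here is a self-contained sketch of the field multiplication such tables encode, assuming the conventional GF(2^8) field polynomial 0x11D; the helper name gfMul is illustrative and not part of the package:

    package main

    import "fmt"

    // gfMul multiplies two GF(2^8) elements by shift-and-add (carry-less
    // "Russian peasant" multiplication), reducing by the assumed field
    // polynomial 0x11D. Tables like mulTable[c][x] precompute this product.
    func gfMul(a, b byte) byte {
        var p byte
        for b != 0 {
            if b&1 != 0 {
                p ^= a
            }
            carry := a&0x80 != 0
            a <<= 1
            if carry {
                a ^= 0x1D // low byte of 0x11D
            }
            b >>= 1
        }
        return p
    }

    func main() {
        fmt.Printf("0x%02x\n", gfMul(2, 128)) // 0x1d: x*x^7 wraps around the polynomial
    }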
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_nopshufb_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_nopshufb_amd64.go
new file mode 100644
index 000000000..89c74e242
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois_nopshufb_amd64.go
@@ -0,0 +1,146 @@
+// Copyright 2015, Klaus Post, see LICENSE for details
+
+//go:build nopshufb && !noasm
+
+package reedsolomon
+
+// bigSwitchover is the size where 64 bytes are processed per loop.
+const bigSwitchover = 128
+
+const pshufb = false
+
+// simple slice xor
+func sliceXor(in, out []byte, o *options) {
+ if o.useSSE2 {
+ if len(in) >= bigSwitchover {
+ if o.useAVX2 {
+ avx2XorSlice_64(in, out)
+ done := (len(in) >> 6) << 6
+ in = in[done:]
+ out = out[done:]
+ } else {
+ sSE2XorSlice_64(in, out)
+ done := (len(in) >> 6) << 6
+ in = in[done:]
+ out = out[done:]
+ }
+ }
+ if len(in) >= 16 {
+ sSE2XorSlice(in, out)
+ done := (len(in) >> 4) << 4
+ in = in[done:]
+ out = out[done:]
+ }
+ } else {
+ sliceXorGo(in, out, o)
+ return
+ }
+ out = out[:len(in)]
+ for i := range in {
+ out[i] ^= in[i]
+ }
+}
+
+func galMulSlice(c byte, in, out []byte, o *options) {
+ out = out[:len(in)]
+ if c == 1 {
+ copy(out, in)
+ return
+ }
+ mt := mulTable[c][:256]
+ for len(in) >= 4 {
+ ii := (*[4]byte)(in)
+ oo := (*[4]byte)(out)
+ oo[0] = mt[ii[0]]
+ oo[1] = mt[ii[1]]
+ oo[2] = mt[ii[2]]
+ oo[3] = mt[ii[3]]
+ in = in[4:]
+ out = out[4:]
+ }
+ for n, input := range in {
+ out[n] = mt[input]
+ }
+}
+
+func galMulSliceXor(c byte, in, out []byte, o *options) {
+ out = out[:len(in)]
+ if c == 1 {
+ sliceXor(in, out, o)
+ return
+ }
+ mt := mulTable[c][:256]
+ for len(in) >= 4 {
+ ii := (*[4]byte)(in)
+ oo := (*[4]byte)(out)
+ oo[0] ^= mt[ii[0]]
+ oo[1] ^= mt[ii[1]]
+ oo[2] ^= mt[ii[2]]
+ oo[3] ^= mt[ii[3]]
+ in = in[4:]
+ out = out[4:]
+ }
+ for n, input := range in {
+ out[n] ^= mt[input]
+ }
+}
+
+func init() {
+ defaultOptions.useAVX512 = false
+}
+
+// 4-way butterfly
+func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+ ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
+ ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+ fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
+ fftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 2-way butterfly forward
+func fftDIT2(x, y []byte, log_m ffe, o *options) {
+ // Reference version:
+ refMulAdd(x, y, log_m)
+ sliceXor(x, y, o)
+}
+
+// 2-way butterfly forward
+func fftDIT28(x, y []byte, log_m ffe8, o *options) {
+ // Reference version:
+ refMulAdd8(x, y, log_m)
+ sliceXor(x, y, o)
+}
+
+// 2-way butterfly inverse
+func ifftDIT2(x, y []byte, log_m ffe, o *options) {
+ // Reference version:
+ sliceXor(x, y, o)
+ refMulAdd(x, y, log_m)
+}
+
+// 2-way butterfly inverse
+func ifftDIT28(x, y []byte, log_m ffe8, o *options) {
+ // Reference version:
+ sliceXor(x, y, o)
+ refMulAdd8(x, y, log_m)
+}
+
+func mulgf16(x, y []byte, log_m ffe, o *options) {
+ refMul(x, y, log_m)
+}
+
+func mulgf8(x, y []byte, log_m ffe8, o *options) {
+ refMul8(x, y, log_m)
+}
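The scalar paths of galMulSlice and galMulSliceXor above unroll four table lookups per iteration using Go 1.17 slice-to-array-pointer conversions, which helps the compiler elide per-element bounds checks. A self-contained sketch of the same idiom with a generic lookup table; applyTable and the example values are illustrative only:

    package main

    import "fmt"

    // applyTable applies a 256-entry lookup table to src four bytes at a
    // time via (*[4]byte) conversions, then finishes the tail with a plain
    // loop, mirroring the structure of the vendored fallbacks.
    func applyTable(dst, src []byte, table *[256]byte) {
        dst = dst[:len(src)]
        for len(src) >= 4 {
            s := (*[4]byte)(src)
            d := (*[4]byte)(dst)
            d[0] = table[s[0]]
            d[1] = table[s[1]]
            d[2] = table[s[2]]
            d[3] = table[s[3]]
            src = src[4:]
            dst = dst[4:]
        }
        for i, b := range src {
            dst[i] = table[b]
        }
    }

    func main() {
        var inc [256]byte
        for i := range inc {
            inc[i] = byte(i + 1)
        }
        src := []byte{0, 1, 2, 250, 251, 252, 253}
        dst := make([]byte, len(src))
        applyTable(dst, src, &inc)
        fmt.Println(dst) // [1 2 3 251 252 253 254]
    }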
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_notamd64.go b/vendor/github.com/klauspost/reedsolomon/galois_notamd64.go
new file mode 100644
index 000000000..f98bfed11
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois_notamd64.go
@@ -0,0 +1,13 @@
+//go:build !amd64 || noasm || appengine || gccgo || pshufb
+
+// Copyright 2020, Klaus Post, see LICENSE for details.
+
+package reedsolomon
+
+func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, byteCount int) {
+ panic("codeSomeShardsAvx512 should not be called if built without asm")
+}
+
+func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, byteCount int) {
+ panic("codeSomeShardsAvx512P should not be called if built without asm")
+}
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go
new file mode 100644
index 000000000..c4c80351f
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go
@@ -0,0 +1,146 @@
+//go:build !noasm && !appengine && !gccgo && !nopshufb
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+// Copyright 2018, Minio, Inc.
+
+package reedsolomon
+
+const pshufb = true
+
+//go:noescape
+func galMulPpc(low, high, in, out []byte)
+
+//go:noescape
+func galMulPpcXor(low, high, in, out []byte)
+
+// This is what the assembler routines do in blocks of 16 bytes:
+/*
+func galMulPpc(low, high, in, out []byte) {
+ for n, input := range in {
+ l := input & 0xf
+ h := input >> 4
+ out[n] = low[l] ^ high[h]
+ }
+}
+func galMulPpcXor(low, high, in, out []byte) {
+ for n, input := range in {
+ l := input & 0xf
+ h := input >> 4
+ out[n] ^= low[l] ^ high[h]
+ }
+}
+*/
+
+func galMulSlice(c byte, in, out []byte, o *options) {
+ if c == 1 {
+ copy(out, in)
+ return
+ }
+ done := (len(in) >> 4) << 4
+ if done > 0 {
+ galMulPpc(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out)
+ }
+ remain := len(in) - done
+ if remain > 0 {
+ mt := mulTable[c][:256]
+ for i := done; i < len(in); i++ {
+ out[i] = mt[in[i]]
+ }
+ }
+}
+
+func galMulSliceXor(c byte, in, out []byte, o *options) {
+ if c == 1 {
+ sliceXor(in, out, o)
+ return
+ }
+ done := (len(in) >> 4) << 4
+ if done > 0 {
+ galMulPpcXor(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out)
+ }
+ remain := len(in) - done
+ if remain > 0 {
+ mt := mulTable[c][:256]
+ for i := done; i < len(in); i++ {
+ out[i] ^= mt[in[i]]
+ }
+ }
+}
+
+// 4-way butterfly
+func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+ ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
+ ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+ fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
+ fftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 2-way butterfly forward
+func fftDIT2(x, y []byte, log_m ffe, o *options) {
+ // Reference version:
+ refMulAdd(x, y, log_m)
+ sliceXorGo(x, y, o)
+}
+
+// 2-way butterfly forward
+func fftDIT28(x, y []byte, log_m ffe8, o *options) {
+ // Reference version:
+ mulAdd8(x, y, log_m, o)
+ sliceXorGo(x, y, o)
+}
+
+// 2-way butterfly inverse
+func ifftDIT2(x, y []byte, log_m ffe, o *options) {
+ // Reference version:
+ sliceXorGo(x, y, o)
+ refMulAdd(x, y, log_m)
+}
+
+// 2-way butterfly inverse
+func ifftDIT28(x, y []byte, log_m ffe8, o *options) {
+ // Reference version:
+ sliceXorGo(x, y, o)
+ mulAdd8(x, y, log_m, o)
+}
+
+func mulgf16(x, y []byte, log_m ffe, o *options) {
+ refMul(x, y, log_m)
+}
+
+func mulAdd8(out, in []byte, log_m ffe8, o *options) {
+ t := &multiply256LUT8[log_m]
+ galMulPpcXor(t[:16], t[16:32], in, out)
+ done := (len(in) >> 4) << 4
+ in = in[done:]
+ if len(in) > 0 {
+ out = out[done:]
+ refMulAdd8(in, out, log_m)
+ }
+}
+
+func mulgf8(out, in []byte, log_m ffe8, o *options) {
+ var done int
+ t := &multiply256LUT8[log_m]
+ galMulPpc(t[:16], t[16:32], in, out)
+ done = (len(in) >> 4) << 4
+
+ remain := len(in) - done
+ if remain > 0 {
+ mt := mul8LUTs[log_m].Value[:]
+ for i := done; i < len(in); i++ {
+ out[i] ^= byte(mt[in[i]])
+ }
+ }
+}
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.s b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.s
new file mode 100644
index 000000000..c585c2b64
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.s
@@ -0,0 +1,127 @@
+//+build !noasm
+//+build !appengine
+//+build !gccgo
+//+build !pshufb
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+// Copyright 2018, Minio, Inc.
+
+#include "textflag.h"
+
+#define LOW R3
+#define HIGH R4
+#define IN R5
+#define LEN R6
+#define OUT R7
+#define CONSTANTS R8
+#define OFFSET R9
+#define OFFSET1 R10
+#define OFFSET2 R11
+
+#define X6 VS34
+#define X6_ V2
+#define X7 VS35
+#define X7_ V3
+#define MSG VS36
+#define MSG_ V4
+#define MSG_HI VS37
+#define MSG_HI_ V5
+#define RESULT VS38
+#define RESULT_ V6
+#define ROTATE VS39
+#define ROTATE_ V7
+#define MASK VS40
+#define MASK_ V8
+#define FLIP VS41
+#define FLIP_ V9
+
+// func galMulPpc(low, high, in, out []byte)
+TEXT ·galMulPpc(SB), NOFRAME|NOSPLIT, $0-96
+ MOVD low+0(FP), LOW
+ MOVD high+24(FP), HIGH
+ MOVD in+48(FP), IN
+ MOVD in_len+56(FP), LEN
+ MOVD out+72(FP), OUT
+
+ MOVD $16, OFFSET1
+ MOVD $32, OFFSET2
+
+ MOVD $·constants(SB), CONSTANTS
+ LXVD2X (CONSTANTS)(R0), ROTATE
+ LXVD2X (CONSTANTS)(OFFSET1), MASK
+ LXVD2X (CONSTANTS)(OFFSET2), FLIP
+
+ LXVD2X (LOW)(R0), X6
+ LXVD2X (HIGH)(R0), X7
+ VPERM X6_, V31, FLIP_, X6_
+ VPERM X7_, V31, FLIP_, X7_
+
+ MOVD $0, OFFSET
+
+loop:
+ LXVD2X (IN)(OFFSET), MSG
+
+ VSRB MSG_, ROTATE_, MSG_HI_
+ VAND MSG_, MASK_, MSG_
+ VPERM X6_, V31, MSG_, MSG_
+ VPERM X7_, V31, MSG_HI_, MSG_HI_
+
+ VXOR MSG_, MSG_HI_, MSG_
+
+ STXVD2X MSG, (OUT)(OFFSET)
+
+ ADD $16, OFFSET, OFFSET
+ CMP LEN, OFFSET
+ BGT loop
+ RET
+
+// func galMulPpcXor(low, high, in, out []byte)

+TEXT ·galMulPpcXor(SB), NOFRAME|NOSPLIT, $0-96
+ MOVD low+0(FP), LOW
+ MOVD high+24(FP), HIGH
+ MOVD in+48(FP), IN
+ MOVD in_len+56(FP), LEN
+ MOVD out+72(FP), OUT
+
+ MOVD $16, OFFSET1
+ MOVD $32, OFFSET2
+
+ MOVD $·constants(SB), CONSTANTS
+ LXVD2X (CONSTANTS)(R0), ROTATE
+ LXVD2X (CONSTANTS)(OFFSET1), MASK
+ LXVD2X (CONSTANTS)(OFFSET2), FLIP
+
+ LXVD2X (LOW)(R0), X6
+ LXVD2X (HIGH)(R0), X7
+ VPERM X6_, V31, FLIP_, X6_
+ VPERM X7_, V31, FLIP_, X7_
+
+ MOVD $0, OFFSET
+
+loopXor:
+ LXVD2X (IN)(OFFSET), MSG
+ LXVD2X (OUT)(OFFSET), RESULT
+
+ VSRB MSG_, ROTATE_, MSG_HI_
+ VAND MSG_, MASK_, MSG_
+ VPERM X6_, V31, MSG_, MSG_
+ VPERM X7_, V31, MSG_HI_, MSG_HI_
+
+ VXOR MSG_, MSG_HI_, MSG_
+ VXOR MSG_, RESULT_, RESULT_
+
+ STXVD2X RESULT, (OUT)(OFFSET)
+
+ ADD $16, OFFSET, OFFSET
+ CMP LEN, OFFSET
+ BGT loopXor
+ RET
+
+DATA ·constants+0x0(SB)/8, $0x0404040404040404
+DATA ·constants+0x8(SB)/8, $0x0404040404040404
+DATA ·constants+0x10(SB)/8, $0x0f0f0f0f0f0f0f0f
+DATA ·constants+0x18(SB)/8, $0x0f0f0f0f0f0f0f0f
+DATA ·constants+0x20(SB)/8, $0x0706050403020100
+DATA ·constants+0x28(SB)/8, $0x0f0e0d0c0b0a0908
+
+GLOBL ·constants(SB), 8, $48
diff --git a/vendor/github.com/klauspost/reedsolomon/inversion_tree.go b/vendor/github.com/klauspost/reedsolomon/inversion_tree.go
new file mode 100644
index 000000000..3f97f810a
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/inversion_tree.go
@@ -0,0 +1,164 @@
+/**
+ * A thread-safe tree which caches inverted matrices.
+ *
+ * Copyright 2016, Peter Collins
+ */
+
+package reedsolomon
+
+import (
+ "errors"
+ "sync"
+)
+
+// The tree uses a Reader-Writer mutex to make it thread-safe
+// when accessing cached matrices and inserting new ones.
+type inversionTree struct {
+ mutex sync.RWMutex
+ root inversionNode
+}
+
+type inversionNode struct {
+ matrix matrix
+ children []*inversionNode
+}
+
+// newInversionTree initializes a tree for storing inverted matrices.
+// Note that the root node is the identity matrix as it implies
+// there were no errors with the original data.
+func newInversionTree(dataShards, parityShards int) *inversionTree {
+ identity, _ := identityMatrix(dataShards)
+ return &inversionTree{
+ root: inversionNode{
+ matrix: identity,
+ children: make([]*inversionNode, dataShards+parityShards),
+ },
+ }
+}
+
+// GetInvertedMatrix returns the cached inverted matrix or nil if it
+// is not found in the tree keyed on the indices of invalid rows.
+func (t *inversionTree) GetInvertedMatrix(invalidIndices []int) matrix {
+ if t == nil {
+ return nil
+ }
+ // Lock the tree for reading before accessing the tree.
+ t.mutex.RLock()
+ defer t.mutex.RUnlock()
+
+	// If no invalid indices were given, we should return the root
+ // identity matrix.
+ if len(invalidIndices) == 0 {
+ return t.root.matrix
+ }
+
+ // Recursively search for the inverted matrix in the tree, passing in
+ // 0 as the parent index as we start at the root of the tree.
+ return t.root.getInvertedMatrix(invalidIndices, 0)
+}
+
+// errAlreadySet is returned if the root node matrix is overwritten
+var errAlreadySet = errors.New("the root node identity matrix is already set")
+
+// InsertInvertedMatrix inserts a new inverted matrix into the tree
+// keyed by the indices of invalid rows. The total number of shards
+// is required for creating the proper length lists of child nodes for
+// each node.
+func (t *inversionTree) InsertInvertedMatrix(invalidIndices []int, matrix matrix, shards int) error {
+ if t == nil {
+ return nil
+ }
+ // If no invalid indices were given then we are done because the
+ // root node is already set with the identity matrix.
+ if len(invalidIndices) == 0 {
+ return errAlreadySet
+ }
+
+ if !matrix.IsSquare() {
+ return errNotSquare
+ }
+
+ // Lock the tree for writing and reading before accessing the tree.
+ t.mutex.Lock()
+ defer t.mutex.Unlock()
+
+ // Recursively create nodes for the inverted matrix in the tree until
+ // we reach the node to insert the matrix to. We start by passing in
+ // 0 as the parent index as we start at the root of the tree.
+ t.root.insertInvertedMatrix(invalidIndices, matrix, shards, 0)
+
+ return nil
+}
+
+func (n *inversionNode) getInvertedMatrix(invalidIndices []int, parent int) matrix {
+ // Get the child node to search next from the list of children. The
+ // list of children starts relative to the parent index passed in
+	// because the indices of invalid rows are sorted (by default). As we
+ // search recursively, the first invalid index gets popped off the list,
+ // so when searching through the list of children, use that first invalid
+ // index to find the child node.
+ firstIndex := invalidIndices[0]
+ node := n.children[firstIndex-parent]
+
+ // If the child node doesn't exist in the list yet, fail fast by
+ // returning, so we can construct and insert the proper inverted matrix.
+ if node == nil {
+ return nil
+ }
+
+ // If there's more than one invalid index left in the list we should
+ // keep searching recursively.
+ if len(invalidIndices) > 1 {
+ // Search recursively on the child node by passing in the invalid indices
+ // with the first index popped off the front. Also the parent index to
+ // pass down is the first index plus one.
+ return node.getInvertedMatrix(invalidIndices[1:], firstIndex+1)
+ }
+ // If there aren't any more invalid indices to search, we've found our
+ // node. Return it, however keep in mind that the matrix could still be
+ // nil because intermediary nodes in the tree are created sometimes with
+ // their inversion matrices uninitialized.
+ return node.matrix
+}
+
+func (n *inversionNode) insertInvertedMatrix(invalidIndices []int, matrix matrix, shards, parent int) {
+ // As above, get the child node to search next from the list of children.
+ // The list of children starts relative to the parent index passed in
+ // because the indices of invalid rows are sorted (by default). As we
+ // search recursively, the first invalid index gets popped off the list,
+ // so when searching through the list of children, use that first invalid
+ // index to find the child node.
+ firstIndex := invalidIndices[0]
+ node := n.children[firstIndex-parent]
+
+ // If the child node doesn't exist in the list yet, create a new
+ // node because we have the writer lock and add it to the list
+ // of children.
+ if node == nil {
+ // Make the length of the list of children equal to the number
+ // of shards minus the first invalid index because the list of
+ // invalid indices is sorted, so only this many further
+ // errors are possible in the tree.
+ node = &inversionNode{
+ children: make([]*inversionNode, shards-firstIndex),
+ }
+ // Insert the new node into the tree at the first index relative
+ // to the parent index that was given in this recursive call.
+ n.children[firstIndex-parent] = node
+ }
+
+ // If there's more than one invalid index left in the list we should
+ // keep searching recursively in order to find the node to add our
+ // matrix.
+ if len(invalidIndices) > 1 {
+ // As above, search recursively on the child node by passing in
+ // the invalid indices with the first index popped off the front.
+ // Also the total number of shards and parent index are passed down
+ // which is equal to the first index plus one.
+ node.insertInvertedMatrix(invalidIndices[1:], matrix, shards, firstIndex+1)
+ } else {
+ // If there aren't any more invalid indices to search, we've found our
+ // node. Cache the inverted matrix in this node.
+ node.matrix = matrix
+ }
+}
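The inversion tree above caches one inverted matrix per sorted set of invalid row indices, with every tree level consuming one index. A minimal sketch of the intended lookup-then-insert flow, written against the package-internal identifiers defined above and in matrix.go (the inserted matrix is a placeholder identity, not a real inversion):

    // Inside package reedsolomon (internal API; illustration only).
    tree := newInversionTree(4, 2) // 4 data shards + 2 parity shards
    invalid := []int{1, 3}         // sorted indices of the missing rows

    if cached := tree.GetInvertedMatrix(invalid); cached == nil {
        inverted, _ := identityMatrix(4) // placeholder for the real inversion
        if err := tree.InsertInvertedMatrix(invalid, inverted, 6); err != nil {
            // errAlreadySet can only happen for an empty index list.
            panic(err)
        }
    }
    // A later decode that loses the same rows now hits the cache.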
diff --git a/vendor/github.com/klauspost/reedsolomon/leopard.go b/vendor/github.com/klauspost/reedsolomon/leopard.go
new file mode 100644
index 000000000..6b4c80184
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/leopard.go
@@ -0,0 +1,1262 @@
+package reedsolomon
+
+// This is a O(n*log n) implementation of Reed-Solomon
+// codes, ported from the C++ library https://github.com/catid/leopard.
+//
+// The implementation is based on the paper
+//
+// S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
+// "Novel Polynomial Basis with Fast Fourier Transform
+// and Its Application to Reed-Solomon Erasure Codes"
+// IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
+
+import (
+ "bytes"
+ "io"
+ "math/bits"
+ "sync"
+ "unsafe"
+
+ "github.com/klauspost/cpuid/v2"
+)
+
+// leopardFF16 is like reedSolomon but for more than 256 total shards.
+type leopardFF16 struct {
+ dataShards int // Number of data shards, should not be modified.
+ parityShards int // Number of parity shards, should not be modified.
+ totalShards int // Total number of shards. Calculated, and should not be modified.
+
+ workPool sync.Pool
+
+ o options
+}
+
+// newFF16 is like New, but for more than 256 total shards.
+func newFF16(dataShards, parityShards int, opt options) (*leopardFF16, error) {
+ initConstants()
+
+ if dataShards <= 0 || parityShards <= 0 {
+ return nil, ErrInvShardNum
+ }
+
+ if dataShards+parityShards > 65536 {
+ return nil, ErrMaxShardNum
+ }
+
+ r := &leopardFF16{
+ dataShards: dataShards,
+ parityShards: parityShards,
+ totalShards: dataShards + parityShards,
+ o: opt,
+ }
+ return r, nil
+}
+
+var _ = Extensions(&leopardFF16{})
+
+func (r *leopardFF16) ShardSizeMultiple() int {
+ return 64
+}
+
+func (r *leopardFF16) DataShards() int {
+ return r.dataShards
+}
+
+func (r *leopardFF16) ParityShards() int {
+ return r.parityShards
+}
+
+func (r *leopardFF16) TotalShards() int {
+ return r.totalShards
+}
+
+func (r *leopardFF16) AllocAligned(each int) [][]byte {
+ return AllocAligned(r.totalShards, each)
+}
+
+type ffe uint16
+
+const (
+ bitwidth = 16
+ order = 1 << bitwidth
+ modulus = order - 1
+ polynomial = 0x1002D
+)
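+
+// polynomial (0x1002D) encodes x^16 + x^5 + x^3 + x^2 + 1, the feedback
+// polynomial used by the LFSR in initLUTs below.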
+
+var (
+ fftSkew *[modulus]ffe
+ logWalsh *[order]ffe
+)
+
+// Logarithm Tables
+var (
+ logLUT *[order]ffe
+ expLUT *[order]ffe
+)
+
+// Stores the partial products of x * y at offset x + y * 65536
+// Repeated accesses from the same y value are faster
+var mul16LUTs *[order]mul16LUT
+
+type mul16LUT struct {
+ // Contains Lo product as a single lookup.
+ // Should be XORed with Hi lookup for result.
+ Lo [256]ffe
+ Hi [256]ffe
+}
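+
+// A full 16-bit product with a fixed log_m is therefore assembled as
+//
+//	lut := &mul16LUTs[log_m]
+//	prod := lut.Lo[lo] ^ lut.Hi[hi] // lo/hi = low/high byte of the element
+//
+// which is what refMulAdd and refMul below do for every element.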
+
+// Stores lookup for avx2
+var multiply256LUT *[order][8 * 16]byte
+
+func (r *leopardFF16) Encode(shards [][]byte) error {
+ if len(shards) != r.totalShards {
+ return ErrTooFewShards
+ }
+
+ if err := checkShards(shards, false); err != nil {
+ return err
+ }
+ return r.encode(shards)
+}
+
+func (r *leopardFF16) encode(shards [][]byte) error {
+ shardSize := shardSize(shards)
+ if shardSize%64 != 0 {
+ return ErrInvalidShardSize
+ }
+
+ m := ceilPow2(r.parityShards)
+ var work [][]byte
+ if w, ok := r.workPool.Get().([][]byte); ok {
+ work = w
+ }
+ if cap(work) >= m*2 {
+ work = work[:m*2]
+ } else {
+ work = AllocAligned(m*2, shardSize)
+ }
+ for i := range work {
+ if cap(work[i]) < shardSize {
+ work[i] = AllocAligned(1, shardSize)[0]
+ } else {
+ work[i] = work[i][:shardSize]
+ }
+ }
+ defer r.workPool.Put(work)
+
+ mtrunc := m
+ if r.dataShards < mtrunc {
+ mtrunc = r.dataShards
+ }
+
+ skewLUT := fftSkew[m-1:]
+
+ sh := shards
+ ifftDITEncoder(
+ sh[:r.dataShards],
+ mtrunc,
+ work,
+ nil, // No xor output
+ m,
+ skewLUT,
+ &r.o,
+ )
+
+ lastCount := r.dataShards % m
+ if m >= r.dataShards {
+ goto skip_body
+ }
+
+ // For sets of m data pieces:
+ for i := m; i+m <= r.dataShards; i += m {
+ sh = sh[m:]
+ skewLUT = skewLUT[m:]
+
+ // work <- work xor IFFT(data + i, m, m + i)
+
+ ifftDITEncoder(
+ sh, // data source
+ m,
+ work[m:], // temporary workspace
+ work, // xor destination
+ m,
+ skewLUT,
+ &r.o,
+ )
+ }
+
+ // Handle final partial set of m pieces:
+ if lastCount != 0 {
+ sh = sh[m:]
+ skewLUT = skewLUT[m:]
+
+ // work <- work xor IFFT(data + i, m, m + i)
+
+ ifftDITEncoder(
+ sh, // data source
+ lastCount,
+ work[m:], // temporary workspace
+ work, // xor destination
+ m,
+ skewLUT,
+ &r.o,
+ )
+ }
+
+skip_body:
+ // work <- FFT(work, m, 0)
+ fftDIT(work, r.parityShards, m, fftSkew[:], &r.o)
+
+ for i, w := range work[:r.parityShards] {
+ sh := shards[i+r.dataShards]
+ if cap(sh) >= shardSize {
+ sh = append(sh[:0], w...)
+ } else {
+ sh = w
+ }
+ shards[i+r.dataShards] = sh
+ }
+
+ return nil
+}
+
+func (r *leopardFF16) EncodeIdx(dataShard []byte, idx int, parity [][]byte) error {
+ return ErrNotSupported
+}
+
+func (r *leopardFF16) Join(dst io.Writer, shards [][]byte, outSize int) error {
+ // Do we have enough shards?
+ if len(shards) < r.dataShards {
+ return ErrTooFewShards
+ }
+ shards = shards[:r.dataShards]
+
+ // Do we have enough data?
+ size := 0
+ for _, shard := range shards {
+ if shard == nil {
+ return ErrReconstructRequired
+ }
+ size += len(shard)
+
+ // Do we have enough data already?
+ if size >= outSize {
+ break
+ }
+ }
+ if size < outSize {
+ return ErrShortData
+ }
+
+ // Copy data to dst
+ write := outSize
+ for _, shard := range shards {
+ if write < len(shard) {
+ _, err := dst.Write(shard[:write])
+ return err
+ }
+ n, err := dst.Write(shard)
+ if err != nil {
+ return err
+ }
+ write -= n
+ }
+ return nil
+}
+
+func (r *leopardFF16) Update(shards [][]byte, newDatashards [][]byte) error {
+ return ErrNotSupported
+}
+
+func (r *leopardFF16) Split(data []byte) ([][]byte, error) {
+ if len(data) == 0 {
+ return nil, ErrShortData
+ }
+ if r.totalShards == 1 && len(data)&63 == 0 {
+ return [][]byte{data}, nil
+ }
+ dataLen := len(data)
+ // Calculate number of bytes per data shard.
+ perShard := (len(data) + r.dataShards - 1) / r.dataShards
+ perShard = ((perShard + 63) / 64) * 64
+ needTotal := r.totalShards * perShard
+
+ if cap(data) > len(data) {
+ if cap(data) > needTotal {
+ data = data[:needTotal]
+ } else {
+ data = data[:cap(data)]
+ }
+ clear := data[dataLen:]
+ for i := range clear {
+ clear[i] = 0
+ }
+ }
+
+ // Only allocate memory if necessary
+ var padding [][]byte
+ if len(data) < needTotal {
+ // calculate maximum number of full shards in `data` slice
+ fullShards := len(data) / perShard
+ padding = AllocAligned(r.totalShards-fullShards, perShard)
+ if dataLen > perShard*fullShards {
+ // Copy partial shards
+ copyFrom := data[perShard*fullShards : dataLen]
+ for i := range padding {
+ if len(copyFrom) == 0 {
+ break
+ }
+ copyFrom = copyFrom[copy(padding[i], copyFrom):]
+ }
+ }
+ } else {
+ zero := data[dataLen : r.totalShards*perShard]
+ for i := range zero {
+ zero[i] = 0
+ }
+ }
+
+ // Split into equal-length shards.
+ dst := make([][]byte, r.totalShards)
+ i := 0
+ for ; i < len(dst) && len(data) >= perShard; i++ {
+ dst[i] = data[:perShard:perShard]
+ data = data[perShard:]
+ }
+
+ for j := 0; i+j < len(dst); j++ {
+ dst[i+j] = padding[0]
+ padding = padding[1:]
+ }
+
+ return dst, nil
+}
+
+func (r *leopardFF16) ReconstructSome(shards [][]byte, required []bool) error {
+ if len(required) == r.totalShards {
+ return r.reconstruct(shards, true)
+ }
+ return r.reconstruct(shards, false)
+}
+
+func (r *leopardFF16) Reconstruct(shards [][]byte) error {
+ return r.reconstruct(shards, true)
+}
+
+func (r *leopardFF16) ReconstructData(shards [][]byte) error {
+ return r.reconstruct(shards, false)
+}
+
+func (r *leopardFF16) Verify(shards [][]byte) (bool, error) {
+ if len(shards) != r.totalShards {
+ return false, ErrTooFewShards
+ }
+ if err := checkShards(shards, false); err != nil {
+ return false, err
+ }
+
+ // Re-encode parity shards to temporary storage.
+ shardSize := len(shards[0])
+ outputs := make([][]byte, r.totalShards)
+ copy(outputs, shards[:r.dataShards])
+ for i := r.dataShards; i < r.totalShards; i++ {
+ outputs[i] = make([]byte, shardSize)
+ }
+ if err := r.Encode(outputs); err != nil {
+ return false, err
+ }
+
+ // Compare.
+ for i := r.dataShards; i < r.totalShards; i++ {
+ if !bytes.Equal(outputs[i], shards[i]) {
+ return false, nil
+ }
+ }
+ return true, nil
+}
+
+func (r *leopardFF16) reconstruct(shards [][]byte, recoverAll bool) error {
+ if len(shards) != r.totalShards {
+ return ErrTooFewShards
+ }
+
+ if err := checkShards(shards, true); err != nil {
+ return err
+ }
+
+ // Quick check: are all of the shards present? If so, there's
+ // nothing to do.
+ numberPresent := 0
+ dataPresent := 0
+ for i := 0; i < r.totalShards; i++ {
+ if len(shards[i]) != 0 {
+ numberPresent++
+ if i < r.dataShards {
+ dataPresent++
+ }
+ }
+ }
+ if numberPresent == r.totalShards || !recoverAll && dataPresent == r.dataShards {
+ // Cool. All of the shards have data. We don't
+ // need to do anything.
+ return nil
+ }
+
+ // Use only if we are missing less than 1/4 parity.
+ useBits := r.totalShards-numberPresent <= r.parityShards/4
+
+ // Check if we have enough to reconstruct.
+ if numberPresent < r.dataShards {
+ return ErrTooFewShards
+ }
+
+ shardSize := shardSize(shards)
+ if shardSize%64 != 0 {
+ return ErrInvalidShardSize
+ }
+
+ m := ceilPow2(r.parityShards)
+ n := ceilPow2(m + r.dataShards)
+
+ const LEO_ERROR_BITFIELD_OPT = true
+
+ // Fill in error locations.
+ var errorBits errorBitfield
+ var errLocs [order]ffe
+ for i := 0; i < r.parityShards; i++ {
+ if len(shards[i+r.dataShards]) == 0 {
+ errLocs[i] = 1
+ if LEO_ERROR_BITFIELD_OPT && recoverAll {
+ errorBits.set(i)
+ }
+ }
+ }
+ for i := r.parityShards; i < m; i++ {
+ errLocs[i] = 1
+ if LEO_ERROR_BITFIELD_OPT && recoverAll {
+ errorBits.set(i)
+ }
+ }
+ for i := 0; i < r.dataShards; i++ {
+ if len(shards[i]) == 0 {
+ errLocs[i+m] = 1
+ if LEO_ERROR_BITFIELD_OPT {
+ errorBits.set(i + m)
+ }
+ }
+ }
+
+ if LEO_ERROR_BITFIELD_OPT && useBits {
+ errorBits.prepare()
+ }
+
+ // Evaluate error locator polynomial
+ fwht(&errLocs, order, m+r.dataShards)
+
+ for i := 0; i < order; i++ {
+ errLocs[i] = ffe((uint(errLocs[i]) * uint(logWalsh[i])) % modulus)
+ }
+
+ fwht(&errLocs, order, order)
+
+ var work [][]byte
+ if w, ok := r.workPool.Get().([][]byte); ok {
+ work = w
+ }
+ if cap(work) >= n {
+ work = work[:n]
+ } else {
+ work = make([][]byte, n)
+ }
+ for i := range work {
+ if cap(work[i]) < shardSize {
+ work[i] = make([]byte, shardSize)
+ } else {
+ work[i] = work[i][:shardSize]
+ }
+ }
+ defer r.workPool.Put(work)
+
+ // work <- recovery data
+
+ for i := 0; i < r.parityShards; i++ {
+ if len(shards[i+r.dataShards]) != 0 {
+ mulgf16(work[i], shards[i+r.dataShards], errLocs[i], &r.o)
+ } else {
+ memclr(work[i])
+ }
+ }
+ for i := r.parityShards; i < m; i++ {
+ memclr(work[i])
+ }
+
+ // work <- original data
+
+ for i := 0; i < r.dataShards; i++ {
+ if len(shards[i]) != 0 {
+ mulgf16(work[m+i], shards[i], errLocs[m+i], &r.o)
+ } else {
+ memclr(work[m+i])
+ }
+ }
+ for i := m + r.dataShards; i < n; i++ {
+ memclr(work[i])
+ }
+
+ // work <- IFFT(work, n, 0)
+
+ ifftDITDecoder(
+ m+r.dataShards,
+ work,
+ n,
+ fftSkew[:],
+ &r.o,
+ )
+
+ // work <- FormalDerivative(work, n)
+
+ for i := 1; i < n; i++ {
+ width := ((i ^ (i - 1)) + 1) >> 1
+ slicesXor(work[i-width:i], work[i:i+width], &r.o)
+ }
+
+ // work <- FFT(work, n, 0) truncated to m + dataShards
+
+ outputCount := m + r.dataShards
+
+ if LEO_ERROR_BITFIELD_OPT && useBits {
+ errorBits.fftDIT(work, outputCount, n, fftSkew[:], &r.o)
+ } else {
+ fftDIT(work, outputCount, n, fftSkew[:], &r.o)
+ }
+
+ // Reveal erasures
+ //
+ // Original = -ErrLocator * FFT( Derivative( IFFT( ErrLocator * ReceivedData ) ) )
+ // mul_mem(x, y, log_m, ) equals x[] = y[] * log_m
+ //
+ // mem layout: [Recovery Data (Power of Two = M)] [Original Data (K)] [Zero Padding out to N]
+ end := r.dataShards
+ if recoverAll {
+ end = r.totalShards
+ }
+ for i := 0; i < end; i++ {
+ if len(shards[i]) != 0 {
+ continue
+ }
+ if cap(shards[i]) >= shardSize {
+ shards[i] = shards[i][:shardSize]
+ } else {
+ shards[i] = make([]byte, shardSize)
+ }
+ if i >= r.dataShards {
+ // Parity shard.
+ mulgf16(shards[i], work[i-r.dataShards], modulus-errLocs[i-r.dataShards], &r.o)
+ } else {
+ // Data shard.
+ mulgf16(shards[i], work[i+m], modulus-errLocs[i+m], &r.o)
+ }
+ }
+ return nil
+}
+
+// Basic no-frills version for decoder
+func ifftDITDecoder(mtrunc int, work [][]byte, m int, skewLUT []ffe, o *options) {
+ // Decimation in time: Unroll 2 layers at a time
+ dist := 1
+ dist4 := 4
+ for dist4 <= m {
+ // For each set of dist*4 elements:
+ for r := 0; r < mtrunc; r += dist4 {
+ iend := r + dist
+ log_m01 := skewLUT[iend-1]
+ log_m02 := skewLUT[iend+dist-1]
+ log_m23 := skewLUT[iend+dist*2-1]
+
+ // For each set of dist elements:
+ for i := r; i < iend; i++ {
+ ifftDIT4(work[i:], dist, log_m01, log_m23, log_m02, o)
+ }
+ }
+ dist = dist4
+ dist4 <<= 2
+ }
+
+ // If there is one layer left:
+ if dist < m {
+ // Assuming that dist = m / 2
+ if dist*2 != m {
+ panic("internal error")
+ }
+
+ log_m := skewLUT[dist-1]
+
+ if log_m == modulus {
+ slicesXor(work[dist:2*dist], work[:dist], o)
+ } else {
+ for i := 0; i < dist; i++ {
+ ifftDIT2(
+ work[i],
+ work[i+dist],
+ log_m,
+ o,
+ )
+ }
+ }
+ }
+}
+
+// In-place FFT for encoder and decoder
+func fftDIT(work [][]byte, mtrunc, m int, skewLUT []ffe, o *options) {
+ // Decimation in time: Unroll 2 layers at a time
+ dist4 := m
+ dist := m >> 2
+ for dist != 0 {
+ // For each set of dist*4 elements:
+ for r := 0; r < mtrunc; r += dist4 {
+ iend := r + dist
+ log_m01 := skewLUT[iend-1]
+ log_m02 := skewLUT[iend+dist-1]
+ log_m23 := skewLUT[iend+dist*2-1]
+
+ // For each set of dist elements:
+ for i := r; i < iend; i++ {
+ fftDIT4(
+ work[i:],
+ dist,
+ log_m01,
+ log_m23,
+ log_m02,
+ o,
+ )
+ }
+ }
+ dist4 = dist
+ dist >>= 2
+ }
+
+ // If there is one layer left:
+ if dist4 == 2 {
+ for r := 0; r < mtrunc; r += 2 {
+ log_m := skewLUT[r+1-1]
+
+ if log_m == modulus {
+ sliceXor(work[r], work[r+1], o)
+ } else {
+ fftDIT2(work[r], work[r+1], log_m, o)
+ }
+ }
+ }
+}
+
+// 4-way butterfly
+func fftDIT4Ref(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+ // First layer:
+ if log_m02 == modulus {
+ sliceXor(work[0], work[dist*2], o)
+ sliceXor(work[dist], work[dist*3], o)
+ } else {
+ fftDIT2(work[0], work[dist*2], log_m02, o)
+ fftDIT2(work[dist], work[dist*3], log_m02, o)
+ }
+
+ // Second layer:
+ if log_m01 == modulus {
+ sliceXor(work[0], work[dist], o)
+ } else {
+ fftDIT2(work[0], work[dist], log_m01, o)
+ }
+
+ if log_m23 == modulus {
+ sliceXor(work[dist*2], work[dist*3], o)
+ } else {
+ fftDIT2(work[dist*2], work[dist*3], log_m23, o)
+ }
+}
+
+// Unrolled IFFT for encoder
+func ifftDITEncoder(data [][]byte, mtrunc int, work [][]byte, xorRes [][]byte, m int, skewLUT []ffe, o *options) {
+ // I tried rolling the memcpy/memset into the first layer of the FFT and
+ // found that it only yields a 4% performance improvement, which is not
+ // worth the extra complexity.
+ for i := 0; i < mtrunc; i++ {
+ copy(work[i], data[i])
+ }
+ for i := mtrunc; i < m; i++ {
+ memclr(work[i])
+ }
+
+ // I tried splitting up the first few layers into L3-cache sized blocks but
+ // found that it only provides about 5% performance boost, which is not
+ // worth the extra complexity.
+
+ // Decimation in time: Unroll 2 layers at a time
+ dist := 1
+ dist4 := 4
+ for dist4 <= m {
+ // For each set of dist*4 elements:
+ for r := 0; r < mtrunc; r += dist4 {
+ iend := r + dist
+ log_m01 := skewLUT[iend]
+ log_m02 := skewLUT[iend+dist]
+ log_m23 := skewLUT[iend+dist*2]
+
+ // For each set of dist elements:
+ for i := r; i < iend; i++ {
+ ifftDIT4(
+ work[i:],
+ dist,
+ log_m01,
+ log_m23,
+ log_m02,
+ o,
+ )
+ }
+ }
+
+ dist = dist4
+ dist4 <<= 2
+ // I tried alternating sweeps left->right and right->left to reduce cache misses.
+ // It provides about 1% performance boost when done for both FFT and IFFT, so it
+ // does not seem to be worth the extra complexity.
+ }
+
+ // If there is one layer left:
+ if dist < m {
+ // Assuming that dist = m / 2
+ if dist*2 != m {
+ panic("internal error")
+ }
+
+ logm := skewLUT[dist]
+
+ if logm == modulus {
+ slicesXor(work[dist:dist*2], work[:dist], o)
+ } else {
+ for i := 0; i < dist; i++ {
+ ifftDIT2(work[i], work[i+dist], logm, o)
+ }
+ }
+ }
+
+ // I tried unrolling this but it does not provide more than 5% performance
+ // improvement for 16-bit finite fields, so it's not worth the complexity.
+ if xorRes != nil {
+ slicesXor(xorRes[:m], work[:m], o)
+ }
+}
+
+func ifftDIT4Ref(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+ // First layer:
+ if log_m01 == modulus {
+ sliceXor(work[0], work[dist], o)
+ } else {
+ ifftDIT2(work[0], work[dist], log_m01, o)
+ }
+
+ if log_m23 == modulus {
+ sliceXor(work[dist*2], work[dist*3], o)
+ } else {
+ ifftDIT2(work[dist*2], work[dist*3], log_m23, o)
+ }
+
+ // Second layer:
+ if log_m02 == modulus {
+ sliceXor(work[0], work[dist*2], o)
+ sliceXor(work[dist], work[dist*3], o)
+ } else {
+ ifftDIT2(work[0], work[dist*2], log_m02, o)
+ ifftDIT2(work[dist], work[dist*3], log_m02, o)
+ }
+}
+
+// Reference version of muladd: x[] ^= y[] * log_m
+func refMulAdd(x, y []byte, log_m ffe) {
+ lut := &mul16LUTs[log_m]
+
+ for len(x) >= 64 {
+ // Assert sizes for no bounds checks in loop
+ hiA := y[32:64]
+ loA := y[:32]
+ dst := x[:64] // Needed, but not checked...
+ for i, lo := range loA {
+ hi := hiA[i]
+ prod := lut.Lo[lo] ^ lut.Hi[hi]
+
+ dst[i] ^= byte(prod)
+ dst[i+32] ^= byte(prod >> 8)
+ }
+ x = x[64:]
+ y = y[64:]
+ }
+}
+
+func memclr(s []byte) {
+ for i := range s {
+ s[i] = 0
+ }
+}
+
+// slicesXor calls xor for every slice pair in v1, v2.
+func slicesXor(v1, v2 [][]byte, o *options) {
+ for i, v := range v1 {
+ sliceXor(v2[i], v, o)
+ }
+}
+
+// Reference version of mul: x[] = y[] * log_m
+func refMul(x, y []byte, log_m ffe) {
+ lut := &mul16LUTs[log_m]
+
+ for off := 0; off < len(x); off += 64 {
+ loA := y[off : off+32]
+ hiA := y[off+32:]
+ hiA = hiA[:len(loA)]
+ for i, lo := range loA {
+ hi := hiA[i]
+ prod := lut.Lo[lo] ^ lut.Hi[hi]
+
+ x[off+i] = byte(prod)
+ x[off+i+32] = byte(prod >> 8)
+ }
+ }
+}
+
+// Returns a * Log(b)
+func mulLog(a, log_b ffe) ffe {
+ /*
+ Note that this operation is not a normal multiplication in a finite
+ field because the right operand is already a logarithm. This is done
+ because it moves K table lookups from the Decode() method into the
+ initialization step that is less performance critical. The LogWalsh[]
+ table below contains precalculated logarithms so it is easier to do
+ all the other multiplies in that form as well.
+ */
+ if a == 0 {
+ return 0
+ }
+ return expLUT[addMod(logLUT[a], log_b)]
+}
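+
+// Stated differently: for nonzero a and b, mulLog(a, logLUT[b]) equals the
+// field product a*b, since it evaluates expLUT[logLUT[a] + logLUT[b]] with
+// the addition reduced mod modulus.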
+
+// z = x + y (mod kModulus)
+func addMod(a, b ffe) ffe {
+ sum := uint(a) + uint(b)
+
+ // Partial reduction step, allowing for kModulus to be returned
+ return ffe(sum + sum>>bitwidth)
+}
+
+// z = x - y (mod kModulus)
+func subMod(a, b ffe) ffe {
+ dif := uint(a) - uint(b)
+
+ // Partial reduction step, allowing for kModulus to be returned
+ return ffe(dif + dif>>bitwidth)
+}
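+
+// Worked example of the partial reduction: addMod(modulus, 1) computes
+// sum = 0x10000, so sum + sum>>16 = 0x10001, which truncates to ffe(1),
+// i.e. (0xFFFF + 1) mod 0xFFFF = 1. A result of exactly kModulus may be
+// returned in place of 0; initLUTs sets expLUT[modulus] = expLUT[0] to
+// keep the two interchangeable.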
+
+// ceilPow2 returns the smallest power of two at or above n.
+func ceilPow2(n int) int {
+ const w = int(unsafe.Sizeof(n) * 8)
+ return 1 << (w - bits.LeadingZeros(uint(n-1)))
+}
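+
+// For example, ceilPow2(1) == 1, ceilPow2(5) == 8 and ceilPow2(8) == 8.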
+
+// Decimation in time (DIT) Fast Walsh-Hadamard Transform
+// Unrolls pairs of layers to perform cross-layer operations in registers
+// mtrunc: Number of elements that are non-zero at the front of data
+func fwht(data *[order]ffe, m, mtrunc int) {
+ // Decimation in time: Unroll 2 layers at a time
+ dist := 1
+ dist4 := 4
+ for dist4 <= m {
+ // For each set of dist*4 elements:
+ for r := 0; r < mtrunc; r += dist4 {
+ // For each set of dist elements:
+ // Use 16 bit indices to avoid bounds check on [65536]ffe.
+ dist := uint16(dist)
+ off := uint16(r)
+ for i := uint16(0); i < dist; i++ {
+ // fwht4(data[i:], dist) inlined...
+ // Reading values appears faster than updating pointers.
+ // Casting to uint is not faster.
+ t0 := data[off]
+ t1 := data[off+dist]
+ t2 := data[off+dist*2]
+ t3 := data[off+dist*3]
+
+ t0, t1 = fwht2alt(t0, t1)
+ t2, t3 = fwht2alt(t2, t3)
+ t0, t2 = fwht2alt(t0, t2)
+ t1, t3 = fwht2alt(t1, t3)
+
+ data[off] = t0
+ data[off+dist] = t1
+ data[off+dist*2] = t2
+ data[off+dist*3] = t3
+ off++
+ }
+ }
+ dist = dist4
+ dist4 <<= 2
+ }
+
+ // If there is one layer left:
+ if dist < m {
+ dist := uint16(dist)
+ for i := uint16(0); i < dist; i++ {
+ fwht2(&data[i], &data[i+dist])
+ }
+ }
+}
+
+func fwht4(data []ffe, s int) {
+ s2 := s << 1
+
+ t0 := &data[0]
+ t1 := &data[s]
+ t2 := &data[s2]
+ t3 := &data[s2+s]
+
+ fwht2(t0, t1)
+ fwht2(t2, t3)
+ fwht2(t0, t2)
+ fwht2(t1, t3)
+}
+
+// {a, b} = {a + b, a - b} (Mod Q)
+func fwht2(a, b *ffe) {
+ sum := addMod(*a, *b)
+ dif := subMod(*a, *b)
+ *a = sum
+ *b = dif
+}
+
+// fwht2alt is as fwht2, but returns result.
+func fwht2alt(a, b ffe) (ffe, ffe) {
+ return addMod(a, b), subMod(a, b)
+}
+
+var initOnce sync.Once
+
+func initConstants() {
+ initOnce.Do(func() {
+ initLUTs()
+ initFFTSkew()
+ initMul16LUT()
+ })
+}
+
+// Initialize logLUT, expLUT.
+func initLUTs() {
+ cantorBasis := [bitwidth]ffe{
+ 0x0001, 0xACCA, 0x3C0E, 0x163E,
+ 0xC582, 0xED2E, 0x914C, 0x4012,
+ 0x6C98, 0x10D8, 0x6A72, 0xB900,
+ 0xFDB8, 0xFB34, 0xFF38, 0x991E,
+ }
+
+ expLUT = &[order]ffe{}
+ logLUT = &[order]ffe{}
+
+ // LFSR table generation:
+ state := 1
+ for i := ffe(0); i < modulus; i++ {
+ expLUT[state] = i
+ state <<= 1
+ if state >= order {
+ state ^= polynomial
+ }
+ }
+ expLUT[0] = modulus
+
+ // Conversion to Cantor basis:
+
+ logLUT[0] = 0
+ for i := 0; i < bitwidth; i++ {
+ basis := cantorBasis[i]
+ width := 1 << i
+
+ for j := 0; j < width; j++ {
+ logLUT[j+width] = logLUT[j] ^ basis
+ }
+ }
+
+ for i := 0; i < order; i++ {
+ logLUT[i] = expLUT[logLUT[i]]
+ }
+
+ for i := 0; i < order; i++ {
+ expLUT[logLUT[i]] = ffe(i)
+ }
+
+ expLUT[modulus] = expLUT[0]
+}
+
+// Initialize fftSkew.
+func initFFTSkew() {
+ var temp [bitwidth - 1]ffe
+
+ // Generate FFT skew vector {1}:
+
+ for i := 1; i < bitwidth; i++ {
+ temp[i-1] = ffe(1 << i)
+ }
+
+ fftSkew = &[modulus]ffe{}
+ logWalsh = &[order]ffe{}
+
+ for m := 0; m < bitwidth-1; m++ {
+ step := 1 << (m + 1)
+
+ fftSkew[1<>4)+16)]
+ lut.Hi[i] = tmp[((i&15)+32)] ^ tmp[((i>>4)+48)]
+ }
+ }
+ if cpuid.CPU.Has(cpuid.SSSE3) || cpuid.CPU.Has(cpuid.AVX2) || cpuid.CPU.Has(cpuid.AVX512F) {
+ multiply256LUT = &[order][16 * 8]byte{}
+
+ for logM := range multiply256LUT[:] {
+ // For each 4 bits of the finite field width in bits:
+ shift := 0
+ for i := 0; i < 4; i++ {
+ // Construct 16 entry LUT for PSHUFB
+ prodLo := multiply256LUT[logM][i*16 : i*16+16]
+ prodHi := multiply256LUT[logM][4*16+i*16 : 4*16+i*16+16]
+ for x := range prodLo[:] {
+ prod := mulLog(ffe(x<<shift), ffe(logM))
+ prodLo[x] = byte(prod)
+ prodHi[x] = byte(prod >> 8)
+ }
+ shift += 4
+ }
+ }
+ }
+}
+
+const kWordMips = 5
+const kWords = order / 64
+const kBigMips = 6
+const kBigWords = (kWords + 63) / 64
+const kBiggestMips = 4
+
+// errorBitfield contains progressive errors to help indicate which
+// shards need reconstruction.
+type errorBitfield struct {
+ Words [kWordMips][kWords]uint64
+ BigWords [kBigMips][kBigWords]uint64
+ BiggestWords [kBiggestMips]uint64
+}
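+
+// Each mip level marks the error positions at a progressively coarser
+// granularity of output indices; fftDIT below looks up the level that
+// matches the current FFT layer so whole 4-way butterflies whose outputs
+// are not needed can be skipped.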
+
+func (e *errorBitfield) set(i int) {
+ e.Words[0][i/64] |= uint64(1) << (i & 63)
+}
+
+func (e *errorBitfield) isNeededFn(mipLevel int) func(bit int) bool {
+ if mipLevel >= 16 {
+ return func(bit int) bool {
+ return true
+ }
+ }
+ if mipLevel >= 12 {
+ w := e.BiggestWords[mipLevel-12]
+ return func(bit int) bool {
+ bit /= 4096
+ return 0 != (w & (uint64(1) << bit))
+ }
+ }
+ if mipLevel >= 6 {
+ w := e.BigWords[mipLevel-6][:]
+ return func(bit int) bool {
+ bit /= 64
+ return 0 != (w[bit/64] & (uint64(1) << (bit & 63)))
+ }
+ }
+ if mipLevel > 0 {
+ w := e.Words[mipLevel-1][:]
+ return func(bit int) bool {
+ return 0 != (w[bit/64] & (uint64(1) << (bit & 63)))
+ }
+ }
+ return nil
+}
+
+func (e *errorBitfield) isNeeded(mipLevel int, bit uint) bool {
+ if mipLevel >= 16 {
+ return true
+ }
+ if mipLevel >= 12 {
+ bit /= 4096
+ return 0 != (e.BiggestWords[mipLevel-12] & (uint64(1) << bit))
+ }
+ if mipLevel >= 6 {
+ bit /= 64
+ return 0 != (e.BigWords[mipLevel-6][bit/64] & (uint64(1) << (bit % 64)))
+ }
+ return 0 != (e.Words[mipLevel-1][bit/64] & (uint64(1) << (bit % 64)))
+}
+
+var kHiMasks = [5]uint64{
+ 0xAAAAAAAAAAAAAAAA,
+ 0xCCCCCCCCCCCCCCCC,
+ 0xF0F0F0F0F0F0F0F0,
+ 0xFF00FF00FF00FF00,
+ 0xFFFF0000FFFF0000,
+}
+
+func (e *errorBitfield) prepare() {
+ // First mip level is for final layer of FFT: pairs of data
+ for i := 0; i < kWords; i++ {
+ w_i := e.Words[0][i]
+ hi2lo0 := w_i | ((w_i & kHiMasks[0]) >> 1)
+ lo2hi0 := (w_i & (kHiMasks[0] >> 1)) << 1
+ w_i = hi2lo0 | lo2hi0
+ e.Words[0][i] = w_i
+
+ bits := 2
+ for j := 1; j < kWordMips; j++ {
+ hi2lo_j := w_i | ((w_i & kHiMasks[j]) >> bits)
+ lo2hi_j := (w_i & (kHiMasks[j] >> bits)) << bits
+ w_i = hi2lo_j | lo2hi_j
+ e.Words[j][i] = w_i
+ bits <<= 1
+ }
+ }
+
+ for i := 0; i < kBigWords; i++ {
+ w_i := uint64(0)
+ bit := uint64(1)
+ src := e.Words[kWordMips-1][i*64 : i*64+64]
+ for _, w := range src {
+ w_i |= (w | (w >> 32) | (w << 32)) & bit
+ bit <<= 1
+ }
+ e.BigWords[0][i] = w_i
+
+ bits := 1
+ for j := 1; j < kBigMips; j++ {
+ hi2lo_j := w_i | ((w_i & kHiMasks[j-1]) >> bits)
+ lo2hi_j := (w_i & (kHiMasks[j-1] >> bits)) << bits
+ w_i = hi2lo_j | lo2hi_j
+ e.BigWords[j][i] = w_i
+ bits <<= 1
+ }
+ }
+
+ w_i := uint64(0)
+ bit := uint64(1)
+ for _, w := range e.BigWords[kBigMips-1][:kBigWords] {
+ w_i |= (w | (w >> 32) | (w << 32)) & bit
+ bit <<= 1
+ }
+ e.BiggestWords[0] = w_i
+
+ bits := uint64(1)
+ for j := 1; j < kBiggestMips; j++ {
+ hi2lo_j := w_i | ((w_i & kHiMasks[j-1]) >> bits)
+ lo2hi_j := (w_i & (kHiMasks[j-1] >> bits)) << bits
+ w_i = hi2lo_j | lo2hi_j
+ e.BiggestWords[j] = w_i
+ bits <<= 1
+ }
+}
+
+func (e *errorBitfield) fftDIT(work [][]byte, mtrunc, m int, skewLUT []ffe, o *options) {
+ // Decimation in time: Unroll 2 layers at a time
+ mipLevel := bits.Len32(uint32(m)) - 1
+
+ dist4 := m
+ dist := m >> 2
+ needed := e.isNeededFn(mipLevel)
+ for dist != 0 {
+ // For each set of dist*4 elements:
+ for r := 0; r < mtrunc; r += dist4 {
+ if !needed(r) {
+ continue
+ }
+ iEnd := r + dist
+ logM01 := skewLUT[iEnd-1]
+ logM02 := skewLUT[iEnd+dist-1]
+ logM23 := skewLUT[iEnd+dist*2-1]
+
+ // For each set of dist elements:
+ for i := r; i < iEnd; i++ {
+ fftDIT4(
+ work[i:],
+ dist,
+ logM01,
+ logM23,
+ logM02,
+ o)
+ }
+ }
+ dist4 = dist
+ dist >>= 2
+ mipLevel -= 2
+ needed = e.isNeededFn(mipLevel)
+ }
+
+ // If there is one layer left:
+ if dist4 == 2 {
+ for r := 0; r < mtrunc; r += 2 {
+ if !needed(r) {
+ continue
+ }
+ logM := skewLUT[r+1-1]
+
+ if logM == modulus {
+ sliceXor(work[r], work[r+1], o)
+ } else {
+ fftDIT2(work[r], work[r+1], logM, o)
+ }
+ }
+ }
+}
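The GF(2^16) leopard codec above is selected when an encoder is created with more than 256 total shards. A minimal usage sketch against the package's exported API, assuming New, Split, Encode and Reconstruct behave as documented upstream (Split pads shards to the 64-byte multiple reported by ShardSizeMultiple):

    package main

    import (
        "log"

        "github.com/klauspost/reedsolomon"
    )

    func main() {
        // 300 data + 100 parity shards exceeds 256 total, so the GF(2^16)
        // implementation above is used internally.
        enc, err := reedsolomon.New(300, 100)
        if err != nil {
            log.Fatal(err)
        }
        shards, err := enc.Split(make([]byte, 1<<20))
        if err != nil {
            log.Fatal(err)
        }
        if err := enc.Encode(shards); err != nil {
            log.Fatal(err)
        }
        shards[0], shards[350] = nil, nil // simulate two lost shards
        if err := enc.Reconstruct(shards); err != nil {
            log.Fatal(err)
        }
    }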
diff --git a/vendor/github.com/klauspost/reedsolomon/leopard8.go b/vendor/github.com/klauspost/reedsolomon/leopard8.go
new file mode 100644
index 000000000..cd863a136
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/leopard8.go
@@ -0,0 +1,1269 @@
+package reedsolomon
+
+// This is a O(n*log n) implementation of Reed-Solomon
+// codes, ported from the C++ library https://github.com/catid/leopard.
+//
+// The implementation is based on the paper
+//
+// S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
+// "Novel Polynomial Basis with Fast Fourier Transform
+// and Its Application to Reed-Solomon Erasure Codes"
+// IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
+
+import (
+ "bytes"
+ "encoding/binary"
+ "io"
+ "math/bits"
+ "sync"
+)
+
+// leopardFF8 is like reedSolomon but for the 8-bit "leopard" implementation.
+type leopardFF8 struct {
+ dataShards int // Number of data shards, should not be modified.
+ parityShards int // Number of parity shards, should not be modified.
+ totalShards int // Total number of shards. Calculated, and should not be modified.
+
+ workPool sync.Pool
+ inversion map[[inversion8Bytes]byte]leopardGF8cache
+ inversionMu sync.Mutex
+
+ o options
+}
+
+const inversion8Bytes = 256 / 8
+
+type leopardGF8cache struct {
+ errorLocs [256]ffe8
+ bits *errorBitfield8
+}
+
+// newFF8 is like New, but for the 8-bit "leopard" implementation.
+func newFF8(dataShards, parityShards int, opt options) (*leopardFF8, error) {
+ initConstants8()
+
+ if dataShards <= 0 || parityShards <= 0 {
+ return nil, ErrInvShardNum
+ }
+
+ if dataShards+parityShards > 65536 {
+ return nil, ErrMaxShardNum
+ }
+
+ r := &leopardFF8{
+ dataShards: dataShards,
+ parityShards: parityShards,
+ totalShards: dataShards + parityShards,
+ o: opt,
+ }
+ if opt.inversionCache && (r.totalShards <= 64 || opt.forcedInversionCache) {
+ // The inversion cache is relatively ineffective for big shard counts and can take up a lot of memory;
+ // r.totalShards does not cover the key space, it is only an initial size estimate.
+ r.inversion = make(map[[inversion8Bytes]byte]leopardGF8cache, r.totalShards)
+ }
+ return r, nil
+}
+
+var _ = Extensions(&leopardFF8{})
+
+func (r *leopardFF8) ShardSizeMultiple() int {
+ return 64
+}
+
+func (r *leopardFF8) DataShards() int {
+ return r.dataShards
+}
+
+func (r *leopardFF8) ParityShards() int {
+ return r.parityShards
+}
+
+func (r *leopardFF8) TotalShards() int {
+ return r.totalShards
+}
+
+func (r *leopardFF8) AllocAligned(each int) [][]byte {
+ return AllocAligned(r.totalShards, each)
+}
+
+type ffe8 uint8
+
+const (
+ bitwidth8 = 8
+ order8 = 1 << bitwidth8
+ modulus8 = order8 - 1
+ polynomial8 = 0x11D
+
+ // Encode in blocks of this size.
+ workSize8 = 32 << 10
+)
+
+var (
+ fftSkew8 *[modulus8]ffe8
+ logWalsh8 *[order8]ffe8
+)
+
+// Logarithm Tables
+var (
+ logLUT8 *[order8]ffe8
+ expLUT8 *[order8]ffe8
+)
+
+// Stores the partial products of x * y at offset x + y * 256
+// Repeated accesses from the same y value are faster
+var mul8LUTs *[order8]mul8LUT
+
+type mul8LUT struct {
+ Value [256]ffe8
+}
+
+// Stores lookup for avx2
+var multiply256LUT8 *[order8][2 * 16]byte
+
+func (r *leopardFF8) Encode(shards [][]byte) error {
+ if len(shards) != r.totalShards {
+ return ErrTooFewShards
+ }
+
+ if err := checkShards(shards, false); err != nil {
+ return err
+ }
+ return r.encode(shards)
+}
+
+func (r *leopardFF8) encode(shards [][]byte) error {
+ shardSize := shardSize(shards)
+ if shardSize%64 != 0 {
+ return ErrInvalidShardSize
+ }
+
+ m := ceilPow2(r.parityShards)
+ var work [][]byte
+ if w, ok := r.workPool.Get().([][]byte); ok {
+ work = w
+ } else {
+ work = AllocAligned(m*2, workSize8)
+ }
+ if cap(work) >= m*2 {
+ work = work[:m*2]
+ for i := range work {
+ if i >= r.parityShards {
+ if cap(work[i]) < workSize8 {
+ work[i] = AllocAligned(1, workSize8)[0]
+ } else {
+ work[i] = work[i][:workSize8]
+ }
+ }
+ }
+ } else {
+ work = AllocAligned(m*2, workSize8)
+ }
+
+ defer r.workPool.Put(work)
+
+ mtrunc := m
+ if r.dataShards < mtrunc {
+ mtrunc = r.dataShards
+ }
+
+ skewLUT := fftSkew8[m-1:]
+
+ // Split large shards.
+ // More likely on lower shard count.
+ off := 0
+ sh := make([][]byte, len(shards))
+
+ // work slice we can modify
+ wMod := make([][]byte, len(work))
+ copy(wMod, work)
+ for off < shardSize {
+ work := wMod
+ sh := sh
+ end := off + workSize8
+ if end > shardSize {
+ end = shardSize
+ sz := shardSize - off
+ for i := range work {
+ // Last iteration only...
+ work[i] = work[i][:sz]
+ }
+ }
+ for i := range shards {
+ sh[i] = shards[i][off:end]
+ }
+
+ // Replace work slices, so we write directly to output.
+ // Note that work has parity *before* data shards.
+ res := shards[r.dataShards:r.totalShards]
+ for i := range res {
+ work[i] = res[i][off:end]
+ }
+
+ ifftDITEncoder8(
+ sh[:r.dataShards],
+ mtrunc,
+ work,
+ nil, // No xor output
+ m,
+ skewLUT,
+ &r.o,
+ )
+
+ lastCount := r.dataShards % m
+ skewLUT2 := skewLUT
+ if m >= r.dataShards {
+ goto skip_body
+ }
+
+ // For sets of m data pieces:
+ for i := m; i+m <= r.dataShards; i += m {
+ sh = sh[m:]
+ skewLUT2 = skewLUT2[m:]
+
+ // work <- work xor IFFT(data + i, m, m + i)
+
+ ifftDITEncoder8(
+ sh, // data source
+ m,
+ work[m:], // temporary workspace
+ work, // xor destination
+ m,
+ skewLUT2,
+ &r.o,
+ )
+ }
+
+ // Handle final partial set of m pieces:
+ if lastCount != 0 {
+ sh = sh[m:]
+ skewLUT2 = skewLUT2[m:]
+
+ // work <- work xor IFFT(data + i, m, m + i)
+
+ ifftDITEncoder8(
+ sh, // data source
+ lastCount,
+ work[m:], // temporary workspace
+ work, // xor destination
+ m,
+ skewLUT2,
+ &r.o,
+ )
+ }
+
+ skip_body:
+ // work <- FFT(work, m, 0)
+ fftDIT8(work, r.parityShards, m, fftSkew8[:], &r.o)
+ off += workSize8
+ }
+
+ return nil
+}
+
+func (r *leopardFF8) EncodeIdx(dataShard []byte, idx int, parity [][]byte) error {
+ return ErrNotSupported
+}
+
+func (r *leopardFF8) Join(dst io.Writer, shards [][]byte, outSize int) error {
+ // Do we have enough shards?
+ if len(shards) < r.dataShards {
+ return ErrTooFewShards
+ }
+ shards = shards[:r.dataShards]
+
+ // Do we have enough data?
+ size := 0
+ for _, shard := range shards {
+ if shard == nil {
+ return ErrReconstructRequired
+ }
+ size += len(shard)
+
+ // Do we have enough data already?
+ if size >= outSize {
+ break
+ }
+ }
+ if size < outSize {
+ return ErrShortData
+ }
+
+ // Copy data to dst
+ write := outSize
+ for _, shard := range shards {
+ if write < len(shard) {
+ _, err := dst.Write(shard[:write])
+ return err
+ }
+ n, err := dst.Write(shard)
+ if err != nil {
+ return err
+ }
+ write -= n
+ }
+ return nil
+}
+
+func (r *leopardFF8) Update(shards [][]byte, newDatashards [][]byte) error {
+ return ErrNotSupported
+}
+
+func (r *leopardFF8) Split(data []byte) ([][]byte, error) {
+ if len(data) == 0 {
+ return nil, ErrShortData
+ }
+ if r.totalShards == 1 && len(data)&63 == 0 {
+ return [][]byte{data}, nil
+ }
+
+ dataLen := len(data)
+ // Calculate number of bytes per data shard.
+ perShard := (len(data) + r.dataShards - 1) / r.dataShards
+ perShard = ((perShard + 63) / 64) * 64
+ needTotal := r.totalShards * perShard
+
+ if cap(data) > len(data) {
+ if cap(data) > needTotal {
+ data = data[:needTotal]
+ } else {
+ data = data[:cap(data)]
+ }
+ clear := data[dataLen:]
+ for i := range clear {
+ clear[i] = 0
+ }
+ }
+
+ // Only allocate memory if necessary
+ var padding [][]byte
+ if len(data) < needTotal {
+ // calculate maximum number of full shards in `data` slice
+ fullShards := len(data) / perShard
+ padding = AllocAligned(r.totalShards-fullShards, perShard)
+ if dataLen > perShard*fullShards {
+ // Copy partial shards
+ copyFrom := data[perShard*fullShards : dataLen]
+ for i := range padding {
+ if len(copyFrom) == 0 {
+ break
+ }
+ copyFrom = copyFrom[copy(padding[i], copyFrom):]
+ }
+ }
+ }
+
+ // Split into equal-length shards.
+ dst := make([][]byte, r.totalShards)
+ i := 0
+ for ; i < len(dst) && len(data) >= perShard; i++ {
+ dst[i] = data[:perShard:perShard]
+ data = data[perShard:]
+ }
+
+ for j := 0; i+j < len(dst); j++ {
+ dst[i+j] = padding[0]
+ padding = padding[1:]
+ }
+
+ return dst, nil
+}
+
+func (r *leopardFF8) ReconstructSome(shards [][]byte, required []bool) error {
+ if len(required) == r.totalShards {
+ return r.reconstruct(shards, true)
+ }
+ return r.reconstruct(shards, false)
+}
+
+func (r *leopardFF8) Reconstruct(shards [][]byte) error {
+ return r.reconstruct(shards, true)
+}
+
+func (r *leopardFF8) ReconstructData(shards [][]byte) error {
+ return r.reconstruct(shards, false)
+}
+
+func (r *leopardFF8) Verify(shards [][]byte) (bool, error) {
+ if len(shards) != r.totalShards {
+ return false, ErrTooFewShards
+ }
+ if err := checkShards(shards, false); err != nil {
+ return false, err
+ }
+
+ // Re-encode parity shards to temporary storage.
+ shardSize := len(shards[0])
+ outputs := make([][]byte, r.totalShards)
+ copy(outputs, shards[:r.dataShards])
+ for i := r.dataShards; i < r.totalShards; i++ {
+ outputs[i] = make([]byte, shardSize)
+ }
+ if err := r.Encode(outputs); err != nil {
+ return false, err
+ }
+
+ // Compare.
+ for i := r.dataShards; i < r.totalShards; i++ {
+ if !bytes.Equal(outputs[i], shards[i]) {
+ return false, nil
+ }
+ }
+ return true, nil
+}
+
+func (r *leopardFF8) reconstruct(shards [][]byte, recoverAll bool) error {
+ if len(shards) != r.totalShards {
+ return ErrTooFewShards
+ }
+
+ if err := checkShards(shards, true); err != nil {
+ return err
+ }
+
+ // Quick check: are all of the shards present? If so, there's
+ // nothing to do.
+ numberPresent := 0
+ dataPresent := 0
+ for i := 0; i < r.totalShards; i++ {
+ if len(shards[i]) != 0 {
+ numberPresent++
+ if i < r.dataShards {
+ dataPresent++
+ }
+ }
+ }
+ if numberPresent == r.totalShards || !recoverAll && dataPresent == r.dataShards {
+ // Cool. All of the shards have data. We don't
+ // need to do anything.
+ return nil
+ }
+
+ // Check if we have enough to reconstruct.
+ if numberPresent < r.dataShards {
+ return ErrTooFewShards
+ }
+
+ shardSize := shardSize(shards)
+ if shardSize%64 != 0 {
+ return ErrInvalidShardSize
+ }
+
+ // Use only if we are missing less than 1/4 parity,
+ // and we are restoring a significant amount of data.
+ useBits := r.totalShards-numberPresent <= r.parityShards/4 && shardSize*r.totalShards >= 64<<10
+
+ m := ceilPow2(r.parityShards)
+ n := ceilPow2(m + r.dataShards)
+
+ const LEO_ERROR_BITFIELD_OPT = true
+
+ // Fill in error locations.
+ var errorBits errorBitfield8
+ var errLocs [order8]ffe8
+ for i := 0; i < r.parityShards; i++ {
+ if len(shards[i+r.dataShards]) == 0 {
+ errLocs[i] = 1
+ if LEO_ERROR_BITFIELD_OPT && recoverAll {
+ errorBits.set(i)
+ }
+ }
+ }
+ for i := r.parityShards; i < m; i++ {
+ errLocs[i] = 1
+ if LEO_ERROR_BITFIELD_OPT && recoverAll {
+ errorBits.set(i)
+ }
+ }
+ for i := 0; i < r.dataShards; i++ {
+ if len(shards[i]) == 0 {
+ errLocs[i+m] = 1
+ if LEO_ERROR_BITFIELD_OPT {
+ errorBits.set(i + m)
+ }
+ }
+ }
+
+ var gotInversion bool
+ if LEO_ERROR_BITFIELD_OPT && r.inversion != nil {
+ cacheID := errorBits.cacheID()
+ r.inversionMu.Lock()
+ if inv, ok := r.inversion[cacheID]; ok {
+ r.inversionMu.Unlock()
+ errLocs = inv.errorLocs
+ if inv.bits != nil && useBits {
+ errorBits = *inv.bits
+ useBits = true
+ } else {
+ useBits = false
+ }
+ gotInversion = true
+ } else {
+ r.inversionMu.Unlock()
+ }
+ }
+
+ if !gotInversion {
+ // No inversion...
+ if LEO_ERROR_BITFIELD_OPT && useBits {
+ errorBits.prepare()
+ }
+
+ // Evaluate error locator polynomial
+ fwht8(&errLocs, order8, m+r.dataShards)
+
+ for i := 0; i < order8; i++ {
+ errLocs[i] = ffe8((uint(errLocs[i]) * uint(logWalsh8[i])) % modulus8)
+ }
+
+ fwht8(&errLocs, order8, order8)
+
+ if r.inversion != nil {
+ c := leopardGF8cache{
+ errorLocs: errLocs,
+ }
+ if useBits {
+ // Heap alloc
+ var x errorBitfield8
+ x = errorBits
+ c.bits = &x
+ }
+ r.inversionMu.Lock()
+ r.inversion[errorBits.cacheID()] = c
+ r.inversionMu.Unlock()
+ }
+ }
+
+ var work [][]byte
+ if w, ok := r.workPool.Get().([][]byte); ok {
+ work = w
+ }
+ if cap(work) >= n {
+ work = work[:n]
+ for i := range work {
+ if cap(work[i]) < workSize8 {
+ work[i] = make([]byte, workSize8)
+ } else {
+ work[i] = work[i][:workSize8]
+ }
+ }
+
+ } else {
+ work = make([][]byte, n)
+ all := make([]byte, n*workSize8)
+ for i := range work {
+ work[i] = all[i*workSize8 : i*workSize8+workSize8]
+ }
+ }
+ defer r.workPool.Put(work)
+
+ // work <- recovery data
+
+ // Split large shards.
+ // More likely on lower shard count.
+ sh := make([][]byte, len(shards))
+ // Copy...
+ copy(sh, shards)
+
+ // Add output
+ for i, sh := range shards {
+ if !recoverAll && i >= r.dataShards {
+ continue
+ }
+ if len(sh) == 0 {
+ if cap(sh) >= shardSize {
+ shards[i] = sh[:shardSize]
+ } else {
+ shards[i] = make([]byte, shardSize)
+ }
+ }
+ }
+
+ off := 0
+ for off < shardSize {
+ endSlice := off + workSize8
+ if endSlice > shardSize {
+ endSlice = shardSize
+ sz := shardSize - off
+ // Last iteration only
+ for i := range work {
+ work[i] = work[i][:sz]
+ }
+ }
+ for i := range shards {
+ if len(sh[i]) != 0 {
+ sh[i] = shards[i][off:endSlice]
+ }
+ }
+ for i := 0; i < r.parityShards; i++ {
+ if len(sh[i+r.dataShards]) != 0 {
+ mulgf8(work[i], sh[i+r.dataShards], errLocs[i], &r.o)
+ } else {
+ memclr(work[i])
+ }
+ }
+ for i := r.parityShards; i < m; i++ {
+ memclr(work[i])
+ }
+
+ // work <- original data
+
+ for i := 0; i < r.dataShards; i++ {
+ if len(sh[i]) != 0 {
+ mulgf8(work[m+i], sh[i], errLocs[m+i], &r.o)
+ } else {
+ memclr(work[m+i])
+ }
+ }
+ for i := m + r.dataShards; i < n; i++ {
+ memclr(work[i])
+ }
+
+ // work <- IFFT(work, n, 0)
+
+ ifftDITDecoder8(
+ m+r.dataShards,
+ work,
+ n,
+ fftSkew8[:],
+ &r.o,
+ )
+
+ // work <- FormalDerivative(work, n)
+
+ for i := 1; i < n; i++ {
+ width := ((i ^ (i - 1)) + 1) >> 1
+ slicesXor(work[i-width:i], work[i:i+width], &r.o)
+ }
+
+ // work <- FFT(work, n, 0) truncated to m + dataShards
+
+ outputCount := m + r.dataShards
+
+ if LEO_ERROR_BITFIELD_OPT && useBits {
+ errorBits.fftDIT8(work, outputCount, n, fftSkew8[:], &r.o)
+ } else {
+ fftDIT8(work, outputCount, n, fftSkew8[:], &r.o)
+ }
+
+ // Reveal erasures
+ //
+ // Original = -ErrLocator * FFT( Derivative( IFFT( ErrLocator * ReceivedData ) ) )
+ // mul_mem(x, y, log_m, ) equals x[] = y[] * log_m
+ //
+ // mem layout: [Recovery Data (Power of Two = M)] [Original Data (K)] [Zero Padding out to N]
+ end := r.dataShards
+ if recoverAll {
+ end = r.totalShards
+ }
+ // Restore
+ for i := 0; i < end; i++ {
+ if len(sh[i]) != 0 {
+ continue
+ }
+
+ if i >= r.dataShards {
+ // Parity shard.
+ mulgf8(shards[i][off:endSlice], work[i-r.dataShards], modulus8-errLocs[i-r.dataShards], &r.o)
+ } else {
+ // Data shard.
+ mulgf8(shards[i][off:endSlice], work[i+m], modulus8-errLocs[i+m], &r.o)
+ }
+ }
+ off += workSize8
+ }
+ return nil
+}
+
+// Basic no-frills version for decoder
+func ifftDITDecoder8(mtrunc int, work [][]byte, m int, skewLUT []ffe8, o *options) {
+ // Decimation in time: Unroll 2 layers at a time
+ dist := 1
+ dist4 := 4
+ for dist4 <= m {
+ // For each set of dist*4 elements:
+ for r := 0; r < mtrunc; r += dist4 {
+ iend := r + dist
+ log_m01 := skewLUT[iend-1]
+ log_m02 := skewLUT[iend+dist-1]
+ log_m23 := skewLUT[iend+dist*2-1]
+
+ // For each set of dist elements:
+ for i := r; i < iend; i++ {
+ ifftDIT48(work[i:], dist, log_m01, log_m23, log_m02, o)
+ }
+ }
+ dist = dist4
+ dist4 <<= 2
+ }
+
+ // If there is one layer left:
+ if dist < m {
+ // Assuming that dist = m / 2
+ if dist*2 != m {
+ panic("internal error")
+ }
+
+ log_m := skewLUT[dist-1]
+
+ if log_m == modulus8 {
+ slicesXor(work[dist:2*dist], work[:dist], o)
+ } else {
+ for i := 0; i < dist; i++ {
+ ifftDIT28(
+ work[i],
+ work[i+dist],
+ log_m,
+ o,
+ )
+ }
+ }
+ }
+}
+
+// In-place FFT for encoder and decoder
+func fftDIT8(work [][]byte, mtrunc, m int, skewLUT []ffe8, o *options) {
+ // Decimation in time: Unroll 2 layers at a time
+ dist4 := m
+ dist := m >> 2
+ for dist != 0 {
+ // For each set of dist*4 elements:
+ for r := 0; r < mtrunc; r += dist4 {
+ iend := r + dist
+ log_m01 := skewLUT[iend-1]
+ log_m02 := skewLUT[iend+dist-1]
+ log_m23 := skewLUT[iend+dist*2-1]
+
+ // For each set of dist elements:
+ for i := r; i < iend; i++ {
+ fftDIT48(
+ work[i:],
+ dist,
+ log_m01,
+ log_m23,
+ log_m02,
+ o,
+ )
+ }
+ }
+ dist4 = dist
+ dist >>= 2
+ }
+
+ // If there is one layer left:
+ if dist4 == 2 {
+ for r := 0; r < mtrunc; r += 2 {
+ log_m := skewLUT[r+1-1]
+
+ if log_m == modulus8 {
+ sliceXor(work[r], work[r+1], o)
+ } else {
+ fftDIT28(work[r], work[r+1], log_m, o)
+ }
+ }
+ }
+}
+
+// 4-way butterfly
+func fftDIT4Ref8(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
+ // First layer:
+ if log_m02 == modulus8 {
+ sliceXor(work[0], work[dist*2], o)
+ sliceXor(work[dist], work[dist*3], o)
+ } else {
+ fftDIT28(work[0], work[dist*2], log_m02, o)
+ fftDIT28(work[dist], work[dist*3], log_m02, o)
+ }
+
+ // Second layer:
+ if log_m01 == modulus8 {
+ sliceXor(work[0], work[dist], o)
+ } else {
+ fftDIT28(work[0], work[dist], log_m01, o)
+ }
+
+ if log_m23 == modulus8 {
+ sliceXor(work[dist*2], work[dist*3], o)
+ } else {
+ fftDIT28(work[dist*2], work[dist*3], log_m23, o)
+ }
+}
+
+// Unrolled IFFT for encoder
+func ifftDITEncoder8(data [][]byte, mtrunc int, work [][]byte, xorRes [][]byte, m int, skewLUT []ffe8, o *options) {
+ // I tried rolling the memcpy/memset into the first layer of the FFT and
+ // found that it only yields a 4% performance improvement, which is not
+ // worth the extra complexity.
+ for i := 0; i < mtrunc; i++ {
+ copy(work[i], data[i])
+ }
+ for i := mtrunc; i < m; i++ {
+ memclr(work[i])
+ }
+
+ // Decimation in time: Unroll 2 layers at a time
+ dist := 1
+ dist4 := 4
+ for dist4 <= m {
+ // For each set of dist*4 elements:
+ for r := 0; r < mtrunc; r += dist4 {
+ iend := r + dist
+ log_m01 := skewLUT[iend]
+ log_m02 := skewLUT[iend+dist]
+ log_m23 := skewLUT[iend+dist*2]
+
+ // For each set of dist elements:
+ for i := r; i < iend; i++ {
+ ifftDIT48(
+ work[i:],
+ dist,
+ log_m01,
+ log_m23,
+ log_m02,
+ o,
+ )
+ }
+ }
+
+ dist = dist4
+ dist4 <<= 2
+ // I tried alternating sweeps left->right and right->left to reduce cache misses.
+ // It provides about 1% performance boost when done for both FFT and IFFT, so it
+ // does not seem to be worth the extra complexity.
+ }
+
+ // If there is one layer left:
+ if dist < m {
+ // Assuming that dist = m / 2
+ if dist*2 != m {
+ panic("internal error")
+ }
+
+ logm := skewLUT[dist]
+
+ if logm == modulus8 {
+ slicesXor(work[dist:dist*2], work[:dist], o)
+ } else {
+ for i := 0; i < dist; i++ {
+ ifftDIT28(work[i], work[i+dist], logm, o)
+ }
+ }
+ }
+
+ // I tried unrolling this but it does not provide more than 5% performance
+ // improvement for 16-bit finite fields, so it's not worth the complexity.
+ if xorRes != nil {
+ slicesXor(xorRes[:m], work[:m], o)
+ }
+}
+
+func ifftDIT4Ref8(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
+ // First layer:
+ if log_m01 == modulus8 {
+ sliceXor(work[0], work[dist], o)
+ } else {
+ ifftDIT28(work[0], work[dist], log_m01, o)
+ }
+
+ if log_m23 == modulus8 {
+ sliceXor(work[dist*2], work[dist*3], o)
+ } else {
+ ifftDIT28(work[dist*2], work[dist*3], log_m23, o)
+ }
+
+ // Second layer:
+ if log_m02 == modulus8 {
+ sliceXor(work[0], work[dist*2], o)
+ sliceXor(work[dist], work[dist*3], o)
+ } else {
+ ifftDIT28(work[0], work[dist*2], log_m02, o)
+ ifftDIT28(work[dist], work[dist*3], log_m02, o)
+ }
+}
+
+// Reference version of muladd: x[] ^= y[] * log_m
+func refMulAdd8(x, y []byte, log_m ffe8) {
+ lut := &mul8LUTs[log_m]
+
+ for len(x) >= 64 {
+ // Assert sizes for no bounds checks in loop
+ src := y[:64]
+ dst := x[:len(src)] // Needed, but not checked...
+ for i, y1 := range src {
+ dst[i] ^= byte(lut.Value[y1])
+ }
+ x = x[64:]
+ y = y[64:]
+ }
+}
+
+// Reference version of mul: x[] = y[] * log_m
+func refMul8(x, y []byte, log_m ffe8) {
+ lut := &mul8LUTs[log_m]
+
+ for off := 0; off < len(x); off += 64 {
+ src := y[off : off+64]
+ for i, y1 := range src {
+ x[off+i] = byte(lut.Value[y1])
+ }
+ }
+}
+
+// Returns a * Log(b)
+func mulLog8(a, log_b ffe8) ffe8 {
+ /*
+ Note that this operation is not a normal multiplication in a finite
+ field because the right operand is already a logarithm. This is done
+ because it moves K table lookups from the Decode() method into the
+ initialization step that is less performance critical. The LogWalsh[]
+ table below contains precalculated logarithms so it is easier to do
+ all the other multiplies in that form as well.
+ */
+ if a == 0 {
+ return 0
+ }
+ return expLUT8[addMod8(logLUT8[a], log_b)]
+}
+
+// z = x + y (mod kModulus)
+func addMod8(a, b ffe8) ffe8 {
+ sum := uint(a) + uint(b)
+
+ // Partial reduction step, allowing for kModulus to be returned
+ return ffe8(sum + sum>>bitwidth8)
+}
+
+// z = x - y (mod kModulus)
+func subMod8(a, b ffe8) ffe8 {
+ dif := uint(a) - uint(b)
+
+ // Partial reduction step, allowing for kModulus to be returned
+ return ffe8(dif + dif>>bitwidth8)
+}
+
+// Decimation in time (DIT) Fast Walsh-Hadamard Transform
+// Unrolls pairs of layers to perform cross-layer operations in registers
+// mtrunc: Number of elements that are non-zero at the front of data
+func fwht8(data *[order8]ffe8, m, mtrunc int) {
+ // Decimation in time: Unroll 2 layers at a time
+ dist := 1
+ dist4 := 4
+ for dist4 <= m {
+ // For each set of dist*4 elements:
+ for r := 0; r < mtrunc; r += dist4 {
+ // For each set of dist elements:
+ // Use 16 bit indices to avoid bounds check on [65536]ffe8.
+ dist := uint16(dist)
+ off := uint16(r)
+ for i := uint16(0); i < dist; i++ {
+ // fwht48(data[i:], dist) inlined...
+ // Reading values appears faster than updating pointers.
+ // Casting to uint is not faster.
+ t0 := data[off]
+ t1 := data[off+dist]
+ t2 := data[off+dist*2]
+ t3 := data[off+dist*3]
+
+ t0, t1 = fwht2alt8(t0, t1)
+ t2, t3 = fwht2alt8(t2, t3)
+ t0, t2 = fwht2alt8(t0, t2)
+ t1, t3 = fwht2alt8(t1, t3)
+
+ data[off] = t0
+ data[off+dist] = t1
+ data[off+dist*2] = t2
+ data[off+dist*3] = t3
+ off++
+ }
+ }
+ dist = dist4
+ dist4 <<= 2
+ }
+
+ // If there is one layer left:
+ if dist < m {
+ dist := uint16(dist)
+ for i := uint16(0); i < dist; i++ {
+ fwht28(&data[i], &data[i+dist])
+ }
+ }
+}
+
+func fwht48(data []ffe8, s int) {
+ s2 := s << 1
+
+ t0 := &data[0]
+ t1 := &data[s]
+ t2 := &data[s2]
+ t3 := &data[s2+s]
+
+ fwht28(t0, t1)
+ fwht28(t2, t3)
+ fwht28(t0, t2)
+ fwht28(t1, t3)
+}
+
+// {a, b} = {a + b, a - b} (Mod Q)
+func fwht28(a, b *ffe8) {
+ sum := addMod8(*a, *b)
+ dif := subMod8(*a, *b)
+ *a = sum
+ *b = dif
+}
+
+// fwht2alt8 is as fwht28, but returns result.
+func fwht2alt8(a, b ffe8) (ffe8, ffe8) {
+ return addMod8(a, b), subMod8(a, b)
+}
+
+var initOnce8 sync.Once
+
+func initConstants8() {
+ initOnce8.Do(func() {
+ initLUTs8()
+ initFFTSkew8()
+ initMul8LUT()
+ })
+}
+
+// Initialize logLUT8, expLUT8.
+func initLUTs8() {
+ cantorBasis := [bitwidth8]ffe8{
+ 1, 214, 152, 146, 86, 200, 88, 230,
+ }
+
+ expLUT8 = &[order8]ffe8{}
+ logLUT8 = &[order8]ffe8{}
+
+ // LFSR table generation:
+ state := 1
+ for i := ffe8(0); i < modulus8; i++ {
+ expLUT8[state] = i
+ state <<= 1
+ if state >= order8 {
+ state ^= polynomial8
+ }
+ }
+ expLUT8[0] = modulus8
+
+ // Conversion to Cantor basis:
+
+ logLUT8[0] = 0
+ for i := 0; i < bitwidth8; i++ {
+ basis := cantorBasis[i]
+ width := 1 << i
+
+ for j := 0; j < width; j++ {
+ logLUT8[j+width] = logLUT8[j] ^ basis
+ }
+ }
+
+ for i := 0; i < order8; i++ {
+ logLUT8[i] = expLUT8[logLUT8[i]]
+ }
+
+ for i := 0; i < order8; i++ {
+ expLUT8[logLUT8[i]] = ffe8(i)
+ }
+
+ expLUT8[modulus8] = expLUT8[0]
+}
+
+// Initialize fftSkew8.
+func initFFTSkew8() {
+ var temp [bitwidth8 - 1]ffe8
+
+ // Generate FFT skew vector {1}:
+
+ for i := 1; i < bitwidth8; i++ {
+ temp[i-1] = ffe8(1 << i)
+ }
+
+ fftSkew8 = &[modulus8]ffe8{}
+ logWalsh8 = &[order8]ffe8{}
+
+ for m := 0; m < bitwidth8-1; m++ {
+ step := 1 << (m + 1)
+
+ fftSkew8[1<>4)+16)]
+ }
+ }
+ // Always initialize assembly tables.
+ // Not as big a resource hog as gf16.
+ if true {
+ multiply256LUT8 = &[order8][16 * 2]byte{}
+
+ for logM := range multiply256LUT8[:] {
+ // For each 4 bits of the finite field width in bits:
+ shift := 0
+ for i := 0; i < 2; i++ {
+ // Construct 16 entry LUT for PSHUFB
+ prod := multiply256LUT8[logM][i*16 : i*16+16]
+ for x := range prod[:] {
+ prod[x] = byte(mulLog8(ffe8(x<= 8 || mipLevel <= 0 {
+ return true
+ }
+ return 0 != (e.Words[mipLevel-1][bit/64] & (uint64(1) << (bit & 63)))
+}
+
+func (e *errorBitfield8) prepare() {
+ // First mip level is for final layer of FFT: pairs of data
+ for i := 0; i < kWords8; i++ {
+ w_i := e.Words[0][i]
+ hi2lo0 := w_i | ((w_i & kHiMasks[0]) >> 1)
+ lo2hi0 := (w_i & (kHiMasks[0] >> 1)) << 1
+ w_i = hi2lo0 | lo2hi0
+ e.Words[0][i] = w_i
+
+ bits := 2
+ for j := 1; j < 5; j++ {
+ hi2lo_j := w_i | ((w_i & kHiMasks[j]) >> bits)
+ lo2hi_j := (w_i & (kHiMasks[j] >> bits)) << bits
+ w_i = hi2lo_j | lo2hi_j
+ e.Words[j][i] = w_i
+ bits <<= 1
+ }
+ }
+
+ for i := 0; i < kWords8; i++ {
+ w := e.Words[4][i]
+ w |= w >> 32
+ w |= w << 32
+ e.Words[5][i] = w
+ }
+
+ for i := 0; i < kWords8; i += 2 {
+ t := e.Words[5][i] | e.Words[5][i+1]
+ e.Words[6][i] = t
+ e.Words[6][i+1] = t
+ }
+}
+
+func (e *errorBitfield8) fftDIT8(work [][]byte, mtrunc, m int, skewLUT []ffe8, o *options) {
+ // Decimation in time: Unroll 2 layers at a time
+ mipLevel := bits.Len32(uint32(m)) - 1
+
+ dist4 := m
+ dist := m >> 2
+ for dist != 0 {
+ // For each set of dist*4 elements:
+ for r := 0; r < mtrunc; r += dist4 {
+ if !e.isNeeded(mipLevel, r) {
+ continue
+ }
+ iEnd := r + dist
+ logM01 := skewLUT[iEnd-1]
+ logM02 := skewLUT[iEnd+dist-1]
+ logM23 := skewLUT[iEnd+dist*2-1]
+
+ // For each set of dist elements:
+ for i := r; i < iEnd; i++ {
+ fftDIT48(
+ work[i:],
+ dist,
+ logM01,
+ logM23,
+ logM02,
+ o)
+ }
+ }
+ dist4 = dist
+ dist >>= 2
+ mipLevel -= 2
+ }
+
+ // If there is one layer left:
+ if dist4 == 2 {
+ for r := 0; r < mtrunc; r += 2 {
+ if !e.isNeeded(mipLevel, r) {
+ continue
+ }
+ logM := skewLUT[r+1-1]
+
+ if logM == modulus8 {
+ sliceXor(work[r], work[r+1], o)
+ } else {
+ fftDIT28(work[r], work[r+1], logM, o)
+ }
+ }
+ }
+}
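leopard8.go mirrors the 16-bit code but works over GF(2^8), processes shards in 32 KiB blocks (workSize8) and can cache inversion results for small shard counts. A short sketch of forcing this code path for a small configuration; WithLeopardGF is assumed to be the relevant upstream option and is not part of this diff:

    // Sketch only: reedsolomon.WithLeopardGF is an assumed upstream option.
    enc, err := reedsolomon.New(10, 4, reedsolomon.WithLeopardGF(true))
    if err != nil {
        log.Fatal(err)
    }
    shards, err := enc.Split(make([]byte, 1<<20)) // padded to 64-byte multiples
    if err != nil {
        log.Fatal(err)
    }
    if err := enc.Encode(shards); err != nil {
        log.Fatal(err)
    }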
diff --git a/vendor/github.com/klauspost/reedsolomon/matrix.go b/vendor/github.com/klauspost/reedsolomon/matrix.go
new file mode 100644
index 000000000..bfdcca66f
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/matrix.go
@@ -0,0 +1,281 @@
+/**
+ * Matrix Algebra over an 8-bit Galois Field
+ *
+ * Copyright 2015, Klaus Post
+ * Copyright 2015, Backblaze, Inc.
+ */
+
+package reedsolomon
+
+import (
+ "errors"
+ "fmt"
+ "strconv"
+ "strings"
+)
+
+// byte[row][col]
+type matrix [][]byte
+
+// newMatrix returns a matrix of zeros.
+func newMatrix(rows, cols int) (matrix, error) {
+ if rows <= 0 {
+ return nil, errInvalidRowSize
+ }
+ if cols <= 0 {
+ return nil, errInvalidColSize
+ }
+
+ m := matrix(make([][]byte, rows))
+ for i := range m {
+ m[i] = make([]byte, cols)
+ }
+ return m, nil
+}
+
+// newMatrixData initializes a matrix with the given row-major data.
+// Note that data is not copied from input.
+func newMatrixData(data [][]byte) (matrix, error) {
+ m := matrix(data)
+ err := m.Check()
+ if err != nil {
+ return nil, err
+ }
+ return m, nil
+}
+
+// identityMatrix returns an identity matrix of the given size.
+func identityMatrix(size int) (matrix, error) {
+ m, err := newMatrix(size, size)
+ if err != nil {
+ return nil, err
+ }
+ for i := range m {
+ m[i][i] = 1
+ }
+ return m, nil
+}
+
+// errInvalidRowSize will be returned if attempting to create a matrix with a negative or zero row count.
+var errInvalidRowSize = errors.New("invalid row size")
+
+// errInvalidColSize will be returned if attempting to create a matrix with a negative or zero column count.
+var errInvalidColSize = errors.New("invalid column size")
+
+// errColSizeMismatch is returned if the size of matrix columns mismatch.
+var errColSizeMismatch = errors.New("column size is not the same for all rows")
+
+func (m matrix) Check() error {
+ rows := len(m)
+ if rows == 0 {
+ return errInvalidRowSize
+ }
+ cols := len(m[0])
+ if cols == 0 {
+ return errInvalidColSize
+ }
+
+ for _, col := range m {
+ if len(col) != cols {
+ return errColSizeMismatch
+ }
+ }
+ return nil
+}
+
+// String returns a human-readable string of the matrix contents.
+//
+// Example: [[1, 2], [3, 4]]
+func (m matrix) String() string {
+ rowOut := make([]string, 0, len(m))
+ for _, row := range m {
+ colOut := make([]string, 0, len(row))
+ for _, col := range row {
+ colOut = append(colOut, strconv.Itoa(int(col)))
+ }
+ rowOut = append(rowOut, "["+strings.Join(colOut, ", ")+"]")
+ }
+ return "[" + strings.Join(rowOut, ", ") + "]"
+}
+
+// Multiply multiplies this matrix (the one on the left) by another
+// matrix (the one on the right) and returns a new matrix with the result.
+func (m matrix) Multiply(right matrix) (matrix, error) {
+ if len(m[0]) != len(right) {
+ return nil, fmt.Errorf("columns on left (%d) is different than rows on right (%d)", len(m[0]), len(right))
+ }
+ result, _ := newMatrix(len(m), len(right[0]))
+ for r, row := range result {
+ for c := range row {
+ var value byte
+ for i := range m[0] {
+ value ^= galMultiply(m[r][i], right[i][c])
+ }
+ result[r][c] = value
+ }
+ }
+ return result, nil
+}
+
+// Augment returns the concatenation of this matrix and the matrix on the right.
+func (m matrix) Augment(right matrix) (matrix, error) {
+ if len(m) != len(right) {
+ return nil, errMatrixSize
+ }
+
+ result, _ := newMatrix(len(m), len(m[0])+len(right[0]))
+ for r, row := range m {
+ for c := range row {
+ result[r][c] = m[r][c]
+ }
+ cols := len(m[0])
+ for c := range right[0] {
+ result[r][cols+c] = right[r][c]
+ }
+ }
+ return result, nil
+}
+
+// errMatrixSize is returned if matrix dimensions do not match.
+var errMatrixSize = errors.New("matrix sizes do not match")
+
+func (m matrix) SameSize(n matrix) error {
+ if len(m) != len(n) {
+ return errMatrixSize
+ }
+ for i := range m {
+ if len(m[i]) != len(n[i]) {
+ return errMatrixSize
+ }
+ }
+ return nil
+}
+
+// SubMatrix returns a part of this matrix. Data is copied.
+func (m matrix) SubMatrix(rmin, cmin, rmax, cmax int) (matrix, error) {
+ result, err := newMatrix(rmax-rmin, cmax-cmin)
+ if err != nil {
+ return nil, err
+ }
+ // OPTME: If used heavily, use copy function to copy slice
+ for r := rmin; r < rmax; r++ {
+ for c := cmin; c < cmax; c++ {
+ result[r-rmin][c-cmin] = m[r][c]
+ }
+ }
+ return result, nil
+}
+
+// SwapRows exchanges two rows in the matrix.
+func (m matrix) SwapRows(r1, r2 int) error {
+ if r1 < 0 || len(m) <= r1 || r2 < 0 || len(m) <= r2 {
+ return errInvalidRowSize
+ }
+ m[r2], m[r1] = m[r1], m[r2]
+ return nil
+}
+
+// IsSquare will return true if the matrix is square, otherwise false.
+func (m matrix) IsSquare() bool {
+ return len(m) == len(m[0])
+}
+
+// errSingular is returned if the matrix is singular and cannot be inverted.
+var errSingular = errors.New("matrix is singular")
+
+// errNotSquare is returned if attempting to invert a non-square matrix.
+var errNotSquare = errors.New("only square matrices can be inverted")
+
+// Invert returns the inverse of this matrix.
+// Returns errSingular when the matrix is singular and doesn't have an inverse.
+// The matrix must be square, otherwise errNotSquare is returned.
+func (m matrix) Invert() (matrix, error) {
+ if !m.IsSquare() {
+ return nil, errNotSquare
+ }
+
+ size := len(m)
+ work, _ := identityMatrix(size)
+ work, _ = m.Augment(work)
+
+ err := work.gaussianElimination()
+ if err != nil {
+ return nil, err
+ }
+
+ return work.SubMatrix(0, size, size, size*2)
+}
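For orientation, a minimal in-package sketch of the Invert/Multiply round trip (a hypothetical helper, e.g. in a _test.go file of this package); the 2x2 matrix is an arbitrary non-singular example:

func invertRoundTrips() bool {
	m, _ := newMatrixData([][]byte{
		{1, 1},
		{1, 2},
	})
	inv, err := m.Invert()
	if err != nil {
		return false
	}
	// A matrix multiplied by its inverse is the identity matrix.
	prod, _ := m.Multiply(inv)
	id, _ := identityMatrix(2)
	return prod.String() == id.String()
}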
+
+func (m matrix) gaussianElimination() error {
+ rows := len(m)
+ columns := len(m[0])
+ // Clear out the part below the main diagonal and scale the main
+ // diagonal to be 1.
+ for r := 0; r < rows; r++ {
+ // If the element on the diagonal is 0, find a row below
+ // that has a non-zero and swap them.
+ if m[r][r] == 0 {
+ for rowBelow := r + 1; rowBelow < rows; rowBelow++ {
+ if m[rowBelow][r] != 0 {
+ err := m.SwapRows(r, rowBelow)
+ if err != nil {
+ return err
+ }
+ break
+ }
+ }
+ }
+ // If we couldn't find one, the matrix is singular.
+ if m[r][r] == 0 {
+ return errSingular
+ }
+ // Scale to 1.
+ if m[r][r] != 1 {
+ scale := galOneOver(m[r][r])
+ for c := 0; c < columns; c++ {
+ m[r][c] = galMultiply(m[r][c], scale)
+ }
+ }
+ // Make everything below the 1 be a 0 by subtracting
+ // a multiple of it. (Subtraction and addition are
+ // both exclusive or in the Galois field.)
+ for rowBelow := r + 1; rowBelow < rows; rowBelow++ {
+ if m[rowBelow][r] != 0 {
+ scale := m[rowBelow][r]
+ for c := 0; c < columns; c++ {
+ m[rowBelow][c] ^= galMultiply(scale, m[r][c])
+ }
+ }
+ }
+ }
+
+ // Now clear the part above the main diagonal.
+ for d := 0; d < rows; d++ {
+ for rowAbove := 0; rowAbove < d; rowAbove++ {
+ if m[rowAbove][d] != 0 {
+ scale := m[rowAbove][d]
+ for c := 0; c < columns; c++ {
+ m[rowAbove][c] ^= galMultiply(scale, m[d][c])
+ }
+
+ }
+ }
+ }
+ return nil
+}
+
+// vandermonde creates a Vandermonde matrix, which is guaranteed to have the
+// property that any subset of rows that forms a square matrix
+// is invertible.
+func vandermonde(rows, cols int) (matrix, error) {
+ result, err := newMatrix(rows, cols)
+ if err != nil {
+ return nil, err
+ }
+ for r, row := range result {
+ for c := range row {
+ result[r][c] = galExp(byte(r), c)
+ }
+ }
+ return result, nil
+}
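To illustrate the property stated above, a hypothetical in-package check that consecutive 3-row subsets of a 5x3 Vandermonde matrix are invertible (the sizes are arbitrary choices for the sketch):

func vandermondeRowsInvertible() bool {
	vm, err := vandermonde(5, 3)
	if err != nil {
		return false
	}
	// Any 3 distinct rows form a square Vandermonde matrix, which must be invertible.
	for start := 0; start+3 <= 5; start++ {
		sub, _ := vm.SubMatrix(start, 0, start+3, 3)
		if _, err := sub.Invert(); err != nil {
			return false
		}
	}
	return true
}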
diff --git a/vendor/github.com/klauspost/reedsolomon/options.go b/vendor/github.com/klauspost/reedsolomon/options.go
new file mode 100644
index 000000000..73cc7d6d2
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/options.go
@@ -0,0 +1,323 @@
+package reedsolomon
+
+import (
+ "runtime"
+ "strings"
+
+ "github.com/klauspost/cpuid/v2"
+)
+
+// Option allows overriding processing parameters.
+type Option func(*options)
+
+type options struct {
+ maxGoroutines int
+ minSplitSize int
+ shardSize int
+ perRound int
+
+ useAvxGNFI,
+ useAvx512GFNI,
+ useAVX512,
+ useAVX2,
+ useSSSE3,
+ useSSE2 bool
+
+ useJerasureMatrix bool
+ usePAR1Matrix bool
+ useCauchy bool
+ fastOneParity bool
+ inversionCache bool
+ forcedInversionCache bool
+ customMatrix [][]byte
+ withLeopard leopardMode
+
+ // stream options
+ concReads bool
+ concWrites bool
+ streamBS int
+}
+
+var defaultOptions = options{
+ maxGoroutines: 384,
+ minSplitSize: -1,
+ fastOneParity: false,
+ inversionCache: true,
+
+ // Detect CPU capabilities.
+ useSSSE3: cpuid.CPU.Supports(cpuid.SSSE3),
+ useSSE2: cpuid.CPU.Supports(cpuid.SSE2),
+ useAVX2: cpuid.CPU.Supports(cpuid.AVX2),
+ useAVX512: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512BW, cpuid.AVX512VL),
+ useAvx512GFNI: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.GFNI, cpuid.AVX512DQ),
+ useAvxGNFI: cpuid.CPU.Supports(cpuid.AVX, cpuid.GFNI),
+}
+
+// leopardMode controls the use of leopard GF in encoding and decoding.
+type leopardMode int
+
+const (
+ // leopardAsNeeded only switches to leopard 16-bit when there are more than
+ // 256 shards.
+ leopardAsNeeded leopardMode = iota
+ // leopardGF16 uses leopard in 16-bit mode for all shard counts.
+ leopardGF16
+ // leopardAlways uses 8-bit leopard for shards less than or equal to 256,
+ // 16-bit leopard otherwise.
+ leopardAlways
+)
+
+func init() {
+ if runtime.GOMAXPROCS(0) <= 1 {
+ defaultOptions.maxGoroutines = 1
+ }
+}
+
+// WithMaxGoroutines sets the maximum number of goroutines used for encoding & decoding.
+// Jobs will be split into this many parts, unless each goroutine would have to process
+// less than minSplitSize bytes (set with WithMinSplitSize).
+// For the best speed, keep this well above the GOMAXPROCS number for more fine-grained
+// scheduling.
+// If n <= 0, it is ignored.
+func WithMaxGoroutines(n int) Option {
+ return func(o *options) {
+ if n > 0 {
+ o.maxGoroutines = n
+ }
+ }
+}
+
+// WithAutoGoroutines will adjust the number of goroutines for optimal speed with a
+// specific shard size.
+// Send in the shard size you expect to send. Other shard sizes will work, but may not
+// run at the optimal speed.
+// Overwrites WithMaxGoroutines.
+// If shardSize <= 0, it is ignored.
+func WithAutoGoroutines(shardSize int) Option {
+ return func(o *options) {
+ o.shardSize = shardSize
+ }
+}
+
+// WithMinSplitSize sets the minimum encoding size in bytes per goroutine.
+// By default this parameter is determined by CPU cache characteristics.
+// See WithMaxGoroutines on how jobs are split.
+// If n <= 0, it is ignored.
+func WithMinSplitSize(n int) Option {
+ return func(o *options) {
+ if n > 0 {
+ o.minSplitSize = n
+ }
+ }
+}
+
+// WithConcurrentStreams will enable concurrent reads and writes on the streams.
+// Default: Disabled, meaning only one stream will be read/written at a time.
+// Ignored if not used on a stream input.
+func WithConcurrentStreams(enabled bool) Option {
+ return func(o *options) {
+ o.concReads, o.concWrites = enabled, enabled
+ }
+}
+
+// WithConcurrentStreamReads will enable concurrent reads from the input streams.
+// Default: Disabled, meaning only one stream will be read at a time.
+// Ignored if not used on a stream input.
+func WithConcurrentStreamReads(enabled bool) Option {
+ return func(o *options) {
+ o.concReads = enabled
+ }
+}
+
+// WithConcurrentStreamWrites will enable concurrent writes to the output streams.
+// Default: Disabled, meaning only one stream will be written at a time.
+// Ignored if not used on a stream input.
+func WithConcurrentStreamWrites(enabled bool) Option {
+ return func(o *options) {
+ o.concWrites = enabled
+ }
+}
+
+// WithInversionCache allows controlling the inversion cache.
+// This will cache reconstruction matrices so they can be reused.
+// Enabled by default, or <= 64 shards for Leopard encoding.
+func WithInversionCache(enabled bool) Option {
+ return func(o *options) {
+ o.inversionCache = enabled
+ o.forcedInversionCache = true
+ }
+}
+
+// WithStreamBlockSize allows setting a custom block size per round of reads/writes.
+// If not set, any shard size set with WithAutoGoroutines will be used.
+// If WithAutoGoroutines is also unset, 4MB will be used.
+// Ignored if not used on stream.
+func WithStreamBlockSize(n int) Option {
+ return func(o *options) {
+ o.streamBS = n
+ }
+}
+
+// WithSSSE3 allows enabling or disabling SSSE3 instructions.
+// If not set, SSSE3 will be turned on or off automatically based on CPU ID information.
+func WithSSSE3(enabled bool) Option {
+ return func(o *options) {
+ o.useSSSE3 = enabled
+ }
+}
+
+// WithAVX2 allows enabling or disabling AVX2 instructions.
+// If not set, AVX2 will be turned on or off automatically based on CPU ID information.
+// This will also disable AVX GFNI instructions.
+func WithAVX2(enabled bool) Option {
+ return func(o *options) {
+ o.useAVX2 = enabled
+ if o.useAvxGNFI {
+ o.useAvxGNFI = enabled
+ }
+ }
+}
+
+// WithSSE2 allows enabling or disabling SSE2 instructions.
+// If not set, SSE2 will be turned on or off automatically based on CPU ID information.
+func WithSSE2(enabled bool) Option {
+ return func(o *options) {
+ o.useSSE2 = enabled
+ }
+}
+
+// WithAVX512 allows enabling or disabling AVX512 (and GFNI) instructions.
+func WithAVX512(enabled bool) Option {
+ return func(o *options) {
+ o.useAVX512 = enabled
+ o.useAvx512GFNI = enabled
+ }
+}
+
+// WithGFNI allows enabling or disabling AVX512+GFNI instructions.
+// If not set, GFNI will be turned on or off automatically based on CPU ID information.
+func WithGFNI(enabled bool) Option {
+ return func(o *options) {
+ o.useAvx512GFNI = enabled
+ }
+}
+
+// WithAVXGFNI allows enabling or disabling GFNI with AVX instructions.
+// If not set, GFNI will be turned on or off automatically based on CPU ID information.
+func WithAVXGFNI(enabled bool) Option {
+ return func(o *options) {
+ o.useAvxGNFI = enabled
+ }
+}
+
+// WithJerasureMatrix causes the encoder to build the Reed-Solomon-Vandermonde
+// matrix in the same way as done by the Jerasure library.
+// The first row and column of the coding matrix only contains 1's in this method
+// so the first parity chunk is always equal to XOR of all data chunks.
+func WithJerasureMatrix() Option {
+ return func(o *options) {
+ o.useJerasureMatrix = true
+ o.usePAR1Matrix = false
+ o.useCauchy = false
+ }
+}
+
+// WithPAR1Matrix causes the encoder to build the matrix the way PARv1
+// does. Note that the method they use is buggy, and may lead to cases
+// where recovery is impossible, even if there are enough parity
+// shards.
+func WithPAR1Matrix() Option {
+ return func(o *options) {
+ o.useJerasureMatrix = false
+ o.usePAR1Matrix = true
+ o.useCauchy = false
+ }
+}
+
+// WithCauchyMatrix will make the encoder build a Cauchy style matrix.
+// The output of this is not compatible with the standard output.
+// A Cauchy matrix is faster to generate. This does not affect data throughput,
+// but will result in slightly faster start-up time.
+func WithCauchyMatrix() Option {
+ return func(o *options) {
+ o.useJerasureMatrix = false
+ o.usePAR1Matrix = false
+ o.useCauchy = true
+ }
+}
+
+// WithFastOneParityMatrix will switch the matrix to a simple xor
+// if there is only one parity shard.
+// The PAR1 matrix already has this property so it has little effect there.
+func WithFastOneParityMatrix() Option {
+ return func(o *options) {
+ o.fastOneParity = true
+ }
+}
+
+// WithCustomMatrix causes the encoder to use the manually specified matrix.
+// customMatrix represents only the parity chunks.
+// customMatrix must have at least ParityShards rows and DataShards columns.
+// It can be used for interoperability with libraries which generate
+// the matrix differently or to implement more complex coding schemes like LRC
+// (locally reconstructible codes).
+func WithCustomMatrix(customMatrix [][]byte) Option {
+ return func(o *options) {
+ o.customMatrix = customMatrix
+ }
+}
+
+// WithLeopardGF16 will always use leopard GF16 for encoding,
+// even when there are fewer than 256 shards.
+// This will likely improve reconstruction time for some setups.
+// This is not compatible with Leopard output for <= 256 shards.
+// Note that Leopard places certain restrictions on use; see other documentation.
+func WithLeopardGF16(enabled bool) Option {
+ return func(o *options) {
+ if enabled {
+ o.withLeopard = leopardGF16
+ } else {
+ o.withLeopard = leopardAsNeeded
+ }
+ }
+}
+
+// WithLeopardGF will use leopard GF for encoding, even when there are fewer than
+// 256 shards.
+// This will likely improve reconstruction time for some setups.
+// Note that Leopard places certain restrictions on use; see other documentation.
+func WithLeopardGF(enabled bool) Option {
+ return func(o *options) {
+ if enabled {
+ o.withLeopard = leopardAlways
+ } else {
+ o.withLeopard = leopardAsNeeded
+ }
+ }
+}
+
+func (o *options) cpuOptions() string {
+ var res []string
+ if o.useSSE2 {
+ res = append(res, "SSE2")
+ }
+ if o.useAVX2 {
+ res = append(res, "AVX2")
+ }
+ if o.useSSSE3 {
+ res = append(res, "SSSE3")
+ }
+ if o.useAVX512 {
+ res = append(res, "AVX512")
+ }
+ if o.useAvx512GFNI {
+ res = append(res, "AVX512+GFNI")
+ }
+ if o.useAvxGNFI {
+ res = append(res, "AVX+GFNI")
+ }
+ if len(res) == 0 {
+ return "pure Go"
+ }
+ return strings.Join(res, ",")
+}
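For context, a small sketch of how these options are typically passed to the New constructor defined in reedsolomon.go below; the shard counts and the expected shard size are arbitrary choices for the example:

package main

import (
	"log"

	"github.com/klauspost/reedsolomon"
)

func main() {
	// 10 data + 4 parity shards, tuned for roughly 1 MiB shards,
	// using a Cauchy coding matrix instead of the default Vandermonde-derived one.
	enc, err := reedsolomon.New(10, 4,
		reedsolomon.WithAutoGoroutines(1<<20),
		reedsolomon.WithCauchyMatrix(),
	)
	if err != nil {
		log.Fatal(err)
	}
	_ = enc
}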
diff --git a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
new file mode 100644
index 000000000..bebba0445
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
@@ -0,0 +1,1741 @@
+/**
+ * Reed-Solomon Coding over 8-bit values.
+ *
+ * Copyright 2015, Klaus Post
+ * Copyright 2015, Backblaze, Inc.
+ */
+
+// Package reedsolomon enables Erasure Coding in Go
+//
+// For usage and examples, see https://github.com/klauspost/reedsolomon
+package reedsolomon
+
+import (
+ "bytes"
+ "errors"
+ "fmt"
+ "io"
+ "runtime"
+ "sync"
+
+ "github.com/klauspost/cpuid/v2"
+)
+
+// Encoder is an interface to encode Reed-Solomon parity sets for your data.
+type Encoder interface {
+ // Encode parity for a set of data shards.
+ // Input is 'shards' containing data shards followed by parity shards.
+ // The number of shards must match the number given to New().
+ // Each shard is a byte array, and they must all be the same size.
+ // The parity shards will always be overwritten and the data shards
+ // will remain the same, so it is safe for you to read from the
+ // data shards while this is running.
+ Encode(shards [][]byte) error
+
+ // EncodeIdx will add parity for a single data shard.
+ // Parity shards should start out as 0. The caller must zero them.
+ // Data shards must be delivered exactly once. There is no check for this.
+ // The parity shards will always be updated and the data shards will remain the same.
+ EncodeIdx(dataShard []byte, idx int, parity [][]byte) error
+
+ // Verify returns true if the parity shards contain correct data.
+ // The data is the same format as Encode. No data is modified, so
+ // you are allowed to read from data while this is running.
+ Verify(shards [][]byte) (bool, error)
+
+ // Reconstruct will recreate the missing shards if possible.
+ //
+ // Given a list of shards, some of which contain data, fills in the
+ // ones that don't have data.
+ //
+ // The length of the array must be equal to the total number of shards.
+ // You indicate that a shard is missing by setting it to nil or zero-length.
+ // If a shard is zero-length but has sufficient capacity, that memory will
+ // be used, otherwise a new []byte will be allocated.
+ //
+ // If there are too few shards to reconstruct the missing
+ // ones, ErrTooFewShards will be returned.
+ //
+ // The reconstructed shard set is complete, but integrity is not verified.
+ // Use the Verify function to check if data set is ok.
+ Reconstruct(shards [][]byte) error
+
+ // ReconstructData will recreate any missing data shards, if possible.
+ //
+ // Given a list of shards, some of which contain data, fills in the
+ // data shards that don't have data.
+ //
+ // The length of the array must be equal to Shards.
+ // You indicate that a shard is missing by setting it to nil or zero-length.
+ // If a shard is zero-length but has sufficient capacity, that memory will
+ // be used, otherwise a new []byte will be allocated.
+ //
+ // If there are too few shards to reconstruct the missing
+ // ones, ErrTooFewShards will be returned.
+ //
+ // As the reconstructed shard set may contain missing parity shards,
+ // calling the Verify function is likely to fail.
+ ReconstructData(shards [][]byte) error
+
+ // ReconstructSome will recreate only requested shards, if possible.
+ //
+ // Given a list of shards, some of which contain data, fills in the
+ // shards indicated by true values in the "required" parameter.
+ // The length of the "required" array must be equal to either Shards or DataShards.
+ // If the length is equal to DataShards, the reconstruction of parity shards will be ignored.
+ //
+ // The length of "shards" array must be equal to Shards.
+ // You indicate that a shard is missing by setting it to nil or zero-length.
+ // If a shard is zero-length but has sufficient capacity, that memory will
+ // be used, otherwise a new []byte will be allocated.
+ //
+ // If there are too few shards to reconstruct the missing
+ // ones, ErrTooFewShards will be returned.
+ //
+ // As the reconstructed shard set may contain missing parity shards,
+ // calling the Verify function is likely to fail.
+ ReconstructSome(shards [][]byte, required []bool) error
+
+ // Update is used when only a few data shards have changed and their parity must be recomputed.
+ // Input 'newDatashards' contains the data shards that have changed.
+ // Input 'shards' contains the old data shards (entries for unchanged data shards may be nil) and the old parity shards.
+ // The new parity shards will be in shards[DataShards:].
+ // Update is very useful when DataShards is much larger than ParityShards and only a few data shards have changed.
+ // It will be faster than Encode and does not need to read all data shards to encode.
+ Update(shards [][]byte, newDatashards [][]byte) error
+
+ // Split a data slice into the number of shards given to the encoder,
+ // and create empty parity shards if necessary.
+ //
+ // The data will be split into equally sized shards.
+ // If the data size isn't divisible by the number of shards,
+ // the last shard will contain extra zeros.
+ //
+ // If there is extra capacity on the provided data slice
+ // it will be used instead of allocating parity shards.
+ // It will be zeroed out.
+ //
+ // There must be at least 1 byte otherwise ErrShortData will be
+ // returned.
+ //
+ // The data will not be copied, except for the last shard, so you
+ // should not modify the data of the input slice afterwards.
+ Split(data []byte) ([][]byte, error)
+
+ // Join the shards and write the data segment to dst.
+ //
+ // Only the data shards are considered.
+ // You must supply the exact output size you want.
+ // If there are too few shards given, ErrTooFewShards will be returned.
+ // If the total data size is less than outSize, ErrShortData will be returned.
+ Join(dst io.Writer, shards [][]byte, outSize int) error
+}
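A minimal sketch of the call sequence the interface above documents (assuming the usual bytes and log imports and an encoder from New; the payload is arbitrary):

enc, err := reedsolomon.New(4, 2)
if err != nil {
	log.Fatal(err)
}
data := []byte("some payload to protect with parity")

shards, err := enc.Split(data) // 4 data shards plus 2 empty parity shards
if err != nil {
	log.Fatal(err)
}
if err = enc.Encode(shards); err != nil { // fills the parity shards
	log.Fatal(err)
}
ok, _ := enc.Verify(shards) // true for freshly encoded shards

var buf bytes.Buffer
if err = enc.Join(&buf, shards, len(data)); err != nil { // recovers the original bytes
	log.Fatal(err)
}
_ = ok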
+
+// Extensions is an optional interface.
+// All returned instances will support this interface.
+type Extensions interface {
+ // ShardSizeMultiple will return the size the shard sizes must be a multiple of.
+ ShardSizeMultiple() int
+
+ // DataShards will return the number of data shards.
+ DataShards() int
+
+ // ParityShards will return the number of parity shards.
+ ParityShards() int
+
+ // TotalShards will return the total number of shards.
+ TotalShards() int
+
+ // AllocAligned will allocate TotalShards number of slices,
+ // aligned to reasonable memory sizes.
+ // Provide the size of each shard.
+ AllocAligned(each int) [][]byte
+}
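A short sketch of the optional Extensions interface; allocShards is a hypothetical helper, not part of the package:

func allocShards(enc reedsolomon.Encoder, shardLen int) [][]byte {
	ext, ok := enc.(reedsolomon.Extensions)
	if !ok {
		return nil // per the comment above, encoders returned by New always support Extensions
	}
	// Round the shard length up to the required multiple before allocating.
	if mul := ext.ShardSizeMultiple(); shardLen%mul != 0 {
		shardLen += mul - shardLen%mul
	}
	return ext.AllocAligned(shardLen) // one aligned buffer per shard, TotalShards in total
}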
+
+const (
+ avx2CodeGenMinSize = 64
+ avx2CodeGenMinShards = 3
+ avx2CodeGenMaxGoroutines = 8
+ gfniCodeGenMaxGoroutines = 4
+
+ intSize = 32 << (^uint(0) >> 63) // 32 or 64
+ maxInt = 1<<(intSize-1) - 1
+)
+
+// reedSolomon contains a matrix for a specific
+// distribution of data shards and parity shards.
+// Construct it using New().
+type reedSolomon struct {
+ dataShards int // Number of data shards, should not be modified.
+ parityShards int // Number of parity shards, should not be modified.
+ totalShards int // Total number of shards. Calculated, and should not be modified.
+ m matrix
+ tree *inversionTree
+ parity [][]byte
+ o options
+ mPoolSz int
+ mPool sync.Pool // Pool for temp matrices, etc
+}
+
+var _ = Extensions(&reedSolomon{})
+
+func (r *reedSolomon) ShardSizeMultiple() int {
+ return 1
+}
+
+func (r *reedSolomon) DataShards() int {
+ return r.dataShards
+}
+
+func (r *reedSolomon) ParityShards() int {
+ return r.parityShards
+}
+
+func (r *reedSolomon) TotalShards() int {
+ return r.totalShards
+}
+
+func (r *reedSolomon) AllocAligned(each int) [][]byte {
+ return AllocAligned(r.totalShards, each)
+}
+
+// ErrInvShardNum will be returned by New, if you attempt to create
+// an Encoder with less than one data shard or less than zero parity
+// shards.
+var ErrInvShardNum = errors.New("cannot create Encoder with less than one data shard or less than zero parity shards")
+
+// ErrMaxShardNum will be returned by New, if you attempt to create an
+// Encoder where data and parity shards are bigger than the order of
+// GF(2^8).
+var ErrMaxShardNum = errors.New("cannot create Encoder with more than 256 data+parity shards")
+
+// ErrNotSupported is returned when an operation is not supported.
+var ErrNotSupported = errors.New("operation not supported")
+
+// buildMatrix creates the matrix to use for encoding, given the
+// number of data shards and the number of total shards.
+//
+// The top square of the matrix is guaranteed to be an identity
+// matrix, which means that the data shards are unchanged after
+// encoding.
+func buildMatrix(dataShards, totalShards int) (matrix, error) {
+ // Start with a Vandermonde matrix. This matrix would work,
+ // in theory, but doesn't have the property that the data
+ // shards are unchanged after encoding.
+ vm, err := vandermonde(totalShards, dataShards)
+ if err != nil {
+ return nil, err
+ }
+
+ // Multiply by the inverse of the top square of the matrix.
+ // This will make the top square be the identity matrix, but
+ // preserve the property that any square subset of rows is
+ // invertible.
+ top, err := vm.SubMatrix(0, 0, dataShards, dataShards)
+ if err != nil {
+ return nil, err
+ }
+
+ topInv, err := top.Invert()
+ if err != nil {
+ return nil, err
+ }
+
+ return vm.Multiply(topInv)
+}
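A hypothetical in-package check of the identity-top-square property described above (4 data shards and 6 total shards chosen arbitrarily):

func buildMatrixTopIsIdentity() bool {
	m, err := buildMatrix(4, 6)
	if err != nil {
		return false
	}
	// The top dataShards x dataShards square is the identity matrix,
	// so encoding leaves the data shards unchanged.
	top, _ := m.SubMatrix(0, 0, 4, 4)
	id, _ := identityMatrix(4)
	return top.String() == id.String()
}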
+
+// buildMatrixJerasure creates the same encoding matrix as the Jerasure library.
+//
+// The top square of the matrix is guaranteed to be an identity
+// matrix, which means that the data shards are unchanged after
+// encoding.
+func buildMatrixJerasure(dataShards, totalShards int) (matrix, error) {
+ // Start with a Vandermonde matrix. This matrix would work,
+ // in theory, but doesn't have the property that the data
+ // shards are unchanged after encoding.
+ vm, err := vandermonde(totalShards, dataShards)
+ if err != nil {
+ return nil, err
+ }
+
+ // Jerasure does this:
+ // first row is always 100..00
+ vm[0][0] = 1
+ for i := 1; i < dataShards; i++ {
+ vm[0][i] = 0
+ }
+ // last row is always 000..01
+ for i := 0; i < dataShards-1; i++ {
+ vm[totalShards-1][i] = 0
+ }
+ vm[totalShards-1][dataShards-1] = 1
+
+ for i := 0; i < dataShards; i++ {
+ // Find the row where i'th col is not 0
+ r := i
+ for ; r < totalShards && vm[r][i] == 0; r++ {
+ }
+ if r != i {
+ // Swap it with i'th row if not already
+ t := vm[r]
+ vm[r] = vm[i]
+ vm[i] = t
+ }
+ // Multiply by the inverted matrix (same as vm.Multiply(vm[0:dataShards].Invert()))
+ if vm[i][i] != 1 {
+ // Make vm[i][i] = 1 by dividing the column by vm[i][i]
+ tmp := galOneOver(vm[i][i])
+ for j := 0; j < totalShards; j++ {
+ vm[j][i] = galMultiply(vm[j][i], tmp)
+ }
+ }
+ for j := 0; j < dataShards; j++ {
+ // Make vm[i][j] = 0 where j != i by adding vm[i][j]*vm[.][i] to each column
+ tmp := vm[i][j]
+ if j != i && tmp != 0 {
+ for r := 0; r < totalShards; r++ {
+ vm[r][j] = galAdd(vm[r][j], galMultiply(tmp, vm[r][i]))
+ }
+ }
+ }
+ }
+
+ // Make vm[dataShards] row all ones - divide each column j by vm[dataShards][j]
+ for j := 0; j < dataShards; j++ {
+ tmp := vm[dataShards][j]
+ if tmp != 1 {
+ tmp = galOneOver(tmp)
+ for i := dataShards; i < totalShards; i++ {
+ vm[i][j] = galMultiply(vm[i][j], tmp)
+ }
+ }
+ }
+
+ // Make vm[dataShards...totalShards-1][0] column all ones - divide each row
+ for i := dataShards + 1; i < totalShards; i++ {
+ tmp := vm[i][0]
+ if tmp != 1 {
+ tmp = galOneOver(tmp)
+ for j := 0; j < dataShards; j++ {
+ vm[i][j] = galMultiply(vm[i][j], tmp)
+ }
+ }
+ }
+
+ return vm, nil
+}
+
+// buildMatrixPAR1 creates the matrix to use for encoding according to
+// the PARv1 spec, given the number of data shards and the number of
+// total shards. Note that the method they use is buggy, and may lead
+// to cases where recovery is impossible, even if there are enough
+// parity shards.
+//
+// The top square of the matrix is guaranteed to be an identity
+// matrix, which means that the data shards are unchanged after
+// encoding.
+func buildMatrixPAR1(dataShards, totalShards int) (matrix, error) {
+ result, err := newMatrix(totalShards, dataShards)
+ if err != nil {
+ return nil, err
+ }
+
+ for r, row := range result {
+ // The top portion of the matrix is the identity
+ // matrix, and the bottom is a transposed Vandermonde
+ // matrix starting at 1 instead of 0.
+ if r < dataShards {
+ result[r][r] = 1
+ } else {
+ for c := range row {
+ result[r][c] = galExp(byte(c+1), r-dataShards)
+ }
+ }
+ }
+ return result, nil
+}
+
+func buildMatrixCauchy(dataShards, totalShards int) (matrix, error) {
+ result, err := newMatrix(totalShards, dataShards)
+ if err != nil {
+ return nil, err
+ }
+
+ for r, row := range result {
+ // The top portion of the matrix is the identity
+ // matrix, and the bottom is a transposed Cauchy matrix.
+ if r < dataShards {
+ result[r][r] = 1
+ } else {
+ for c := range row {
+ result[r][c] = invTable[(byte(r ^ c))]
+ }
+ }
+ }
+ return result, nil
+}
+
+// buildXorMatrix can be used to build a matrix with pure XOR
+// operations if there is only one parity shard.
+func buildXorMatrix(dataShards, totalShards int) (matrix, error) {
+ if dataShards+1 != totalShards {
+ return nil, errors.New("internal error")
+ }
+ result, err := newMatrix(totalShards, dataShards)
+ if err != nil {
+ return nil, err
+ }
+
+ for r, row := range result {
+ // The top portion of the matrix is the identity
+ // matrix.
+ if r < dataShards {
+ result[r][r] = 1
+ } else {
+ // Set all values to 1 (XOR)
+ for c := range row {
+ result[r][c] = 1
+ }
+ }
+ }
+ return result, nil
+}
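For intuition, a tiny in-package sketch (with fmt imported) of the matrix this produces for three data shards and one parity shard, making the parity shard the plain XOR of the data shards:

m, _ := buildXorMatrix(3, 4)
fmt.Println(m) // [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 1]]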
+
+// New creates a new encoder and initializes it to
+// the number of data shards and parity shards that
+// you want to use. You can reuse this encoder.
+// Note that the maximum number of total shards is 65536, with some
+// restrictions for a total larger than 256:
+//
+// - Shard sizes must be a multiple of 64
+// - The methods Join/Split/Update/EncodeIdx are not supported
+//
+// If no options are supplied, default options are used.
+func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
+ o := defaultOptions
+ for _, opt := range opts {
+ opt(&o)
+ }
+
+ totShards := dataShards + parityShards
+ switch {
+ case o.withLeopard == leopardGF16 && parityShards > 0 || totShards > 256:
+ return newFF16(dataShards, parityShards, o)
+ case o.withLeopard == leopardAlways && parityShards > 0:
+ return newFF8(dataShards, parityShards, o)
+ }
+ if totShards > 256 {
+ return nil, ErrMaxShardNum
+ }
+
+ r := reedSolomon{
+ dataShards: dataShards,
+ parityShards: parityShards,
+ totalShards: dataShards + parityShards,
+ o: o,
+ }
+
+ if dataShards <= 0 || parityShards < 0 {
+ return nil, ErrInvShardNum
+ }
+
+ if parityShards == 0 {
+ return &r, nil
+ }
+
+ var err error
+ switch {
+ case r.o.customMatrix != nil:
+ if len(r.o.customMatrix) < parityShards {
+ return nil, errors.New("coding matrix must contain at least parityShards rows")
+ }
+ r.m = make([][]byte, r.totalShards)
+ for i := 0; i < dataShards; i++ {
+ r.m[i] = make([]byte, dataShards)
+ r.m[i][i] = 1
+ }
+ for k, row := range r.o.customMatrix {
+ if len(row) < dataShards {
+ return nil, errors.New("coding matrix must contain at least dataShards columns")
+ }
+ r.m[dataShards+k] = make([]byte, dataShards)
+ copy(r.m[dataShards+k], row)
+ }
+ case r.o.fastOneParity && parityShards == 1:
+ r.m, err = buildXorMatrix(dataShards, r.totalShards)
+ case r.o.useCauchy:
+ r.m, err = buildMatrixCauchy(dataShards, r.totalShards)
+ case r.o.usePAR1Matrix:
+ r.m, err = buildMatrixPAR1(dataShards, r.totalShards)
+ case r.o.useJerasureMatrix:
+ r.m, err = buildMatrixJerasure(dataShards, r.totalShards)
+ default:
+ r.m, err = buildMatrix(dataShards, r.totalShards)
+ }
+ if err != nil {
+ return nil, err
+ }
+
+ // Calculate what we want per round
+ r.o.perRound = cpuid.CPU.Cache.L2
+ if r.o.perRound < 128<<10 {
+ r.o.perRound = 128 << 10
+ }
+
+ divide := parityShards + 1
+ if avx2CodeGen && r.o.useAVX2 && (dataShards > maxAvx2Inputs || parityShards > maxAvx2Outputs) {
+ // Base on L1 cache if we have many inputs.
+ r.o.perRound = cpuid.CPU.Cache.L1D
+ if r.o.perRound < 32<<10 {
+ r.o.perRound = 32 << 10
+ }
+ divide = 0
+ if dataShards > maxAvx2Inputs {
+ divide += maxAvx2Inputs
+ } else {
+ divide += dataShards
+ }
+ if parityShards > maxAvx2Inputs {
+ divide += maxAvx2Outputs
+ } else {
+ divide += parityShards
+ }
+ }
+
+ if cpuid.CPU.ThreadsPerCore > 1 && r.o.maxGoroutines > cpuid.CPU.PhysicalCores {
+ // If multiple threads per core, make sure they don't contend for cache.
+ r.o.perRound /= cpuid.CPU.ThreadsPerCore
+ }
+
+ // 1 input + parity must fit in cache, and we add one more to be safer.
+ r.o.perRound = r.o.perRound / divide
+ // Align to 64 bytes.
+ r.o.perRound = ((r.o.perRound + 63) / 64) * 64
+
+ // Final sanity check...
+ if r.o.perRound < 1<<10 {
+ r.o.perRound = 1 << 10
+ }
+
+ if r.o.minSplitSize <= 0 {
+ // Set minsplit as high as we can, but still have parity in L1.
+ cacheSize := cpuid.CPU.Cache.L1D
+ if cacheSize <= 0 {
+ cacheSize = 32 << 10
+ }
+
+ r.o.minSplitSize = cacheSize / (parityShards + 1)
+ // Min 1K
+ if r.o.minSplitSize < 1024 {
+ r.o.minSplitSize = 1024
+ }
+ }
+
+ if r.o.shardSize > 0 {
+ p := runtime.GOMAXPROCS(0)
+ if p == 1 || r.o.shardSize <= r.o.minSplitSize*2 {
+ // Not worth it.
+ r.o.maxGoroutines = 1
+ } else {
+ g := r.o.shardSize / r.o.perRound
+
+ // Overprovision by a factor of 2.
+ if g < p*2 && r.o.perRound > r.o.minSplitSize*2 {
+ g = p * 2
+ r.o.perRound /= 2
+ }
+
+ // Have g be multiple of p
+ g += p - 1
+ g -= g % p
+
+ r.o.maxGoroutines = g
+ }
+ }
+
+ // Generated AVX2 does not need data to stay in L1 cache between runs.
+ // We will be purely limited by RAM speed.
+ if r.canAVX2C(avx2CodeGenMinSize, maxAvx2Inputs, maxAvx2Outputs) && r.o.maxGoroutines > avx2CodeGenMaxGoroutines {
+ r.o.maxGoroutines = avx2CodeGenMaxGoroutines
+ }
+
+ if r.canGFNI(avx2CodeGenMinSize, maxAvx2Inputs, maxAvx2Outputs) && r.o.maxGoroutines > gfniCodeGenMaxGoroutines {
+ r.o.maxGoroutines = gfniCodeGenMaxGoroutines
+ }
+
+ // Inverted matrices are cached in a tree keyed by the indices
+ // of the invalid rows of the data to reconstruct.
+ // The inversion root node will have the identity matrix as
+ // its inversion matrix because it implies there are no errors
+ // with the original data.
+ if r.o.inversionCache {
+ r.tree = newInversionTree(dataShards, parityShards)
+ }
+
+ r.parity = make([][]byte, parityShards)
+ for i := range r.parity {
+ r.parity[i] = r.m[dataShards+i]
+ }
+
+ if avx2CodeGen && r.o.useAVX2 {
+ sz := r.dataShards * r.parityShards * 2 * 32
+ r.mPool.New = func() interface{} {
+ return AllocAligned(1, sz)[0]
+ }
+ r.mPoolSz = sz
+ }
+ return &r, err
+}
+
+func (r *reedSolomon) getTmpSlice() []byte {
+ return r.mPool.Get().([]byte)
+}
+
+func (r *reedSolomon) putTmpSlice(b []byte) {
+ if b != nil && cap(b) >= r.mPoolSz {
+ r.mPool.Put(b[:r.mPoolSz])
+ return
+ }
+ if false {
+ // Sanity check
+ panic(fmt.Sprintf("got short tmp returned, want %d, got %d", r.mPoolSz, cap(b)))
+ }
+}
+
+// ErrTooFewShards is returned if too few shards were given to
+// Encode/Verify/Reconstruct/Update. It will also be returned from Reconstruct
+// if there were too few shards to reconstruct the missing data.
+var ErrTooFewShards = errors.New("too few shards given")
+
+// Encode parity for a set of data shards.
+// An array 'shards' containing data shards followed by parity shards.
+// The number of shards must match the number given to New.
+// Each shard is a byte array, and they must all be the same size.
+// The parity shards will always be overwritten and the data shards
+// will remain the same.
+func (r *reedSolomon) Encode(shards [][]byte) error {
+ if len(shards) != r.totalShards {
+ return ErrTooFewShards
+ }
+
+ err := checkShards(shards, false)
+ if err != nil {
+ return err
+ }
+
+ // Get the slice of output buffers.
+ output := shards[r.dataShards:]
+
+ // Do the coding.
+ r.codeSomeShards(r.parity, shards[0:r.dataShards], output[:r.parityShards], len(shards[0]))
+ return nil
+}
+
+// EncodeIdx will add parity for a single data shard.
+// Parity shards should start out zeroed. The caller must zero them before first call.
+// Data shards should only be delivered once. There is no check for this.
+// The parity shards will always be updated and the data shards will remain unchanged.
+func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) error {
+ if len(parity) != r.parityShards {
+ return ErrTooFewShards
+ }
+ if len(parity) == 0 {
+ return nil
+ }
+ if idx < 0 || idx >= r.dataShards {
+ return ErrInvShardNum
+ }
+ err := checkShards(parity, false)
+ if err != nil {
+ return err
+ }
+ if len(parity[0]) != len(dataShard) {
+ return ErrShardSize
+ }
+
+ if avx2CodeGen && len(dataShard) >= r.o.perRound && len(parity) >= avx2CodeGenMinShards && ((pshufb && r.o.useAVX2) || r.o.useAvx512GFNI || r.o.useAvxGNFI) {
+ m := make([][]byte, r.parityShards)
+ for iRow := range m {
+ m[iRow] = r.parity[iRow][idx : idx+1]
+ }
+ if r.o.useAvx512GFNI || r.o.useAvxGNFI {
+ r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false)
+ } else {
+ r.codeSomeShardsAVXP(m, [][]byte{dataShard}, parity, len(dataShard), false)
+ }
+ return nil
+ }
+
+ // Process using no goroutines for now.
+ start, end := 0, r.o.perRound
+ if end > len(dataShard) {
+ end = len(dataShard)
+ }
+
+ for start < len(dataShard) {
+ in := dataShard[start:end]
+ for iRow := 0; iRow < r.parityShards; iRow++ {
+ galMulSliceXor(r.parity[iRow][idx], in, parity[iRow][start:end], &r.o)
+ }
+ start = end
+ end += r.o.perRound
+ if end > len(dataShard) {
+ end = len(dataShard)
+ }
+ }
+ return nil
+}
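A hedged sketch of incremental encoding with EncodeIdx, following the constraints documented above; enc, inputShards and shardLen are assumed to come from surrounding code:

parity := make([][]byte, 2) // two parity shards in this sketch
for i := range parity {
	parity[i] = make([]byte, shardLen) // parity must start out zeroed
}
for idx, shard := range inputShards { // each data shard delivered exactly once
	if err := enc.EncodeIdx(shard, idx, parity); err != nil {
		log.Fatal(err)
	}
}
// parity should now match what Encode would produce for the same data shards.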
+
+// ErrInvalidInput is returned if Update is called with an invalid input parameter.
+var ErrInvalidInput = errors.New("invalid input")
+
+func (r *reedSolomon) Update(shards [][]byte, newDatashards [][]byte) error {
+ if len(shards) != r.totalShards {
+ return ErrTooFewShards
+ }
+
+ if len(newDatashards) != r.dataShards {
+ return ErrTooFewShards
+ }
+
+ err := checkShards(shards, true)
+ if err != nil {
+ return err
+ }
+
+ err = checkShards(newDatashards, true)
+ if err != nil {
+ return err
+ }
+
+ for i := range newDatashards {
+ if newDatashards[i] != nil && shards[i] == nil {
+ return ErrInvalidInput
+ }
+ }
+ for _, p := range shards[r.dataShards:] {
+ if p == nil {
+ return ErrInvalidInput
+ }
+ }
+
+ shardSize := shardSize(shards)
+
+ // Get the slice of output buffers.
+ output := shards[r.dataShards:]
+
+ // Do the coding.
+ r.updateParityShards(r.parity, shards[0:r.dataShards], newDatashards[0:r.dataShards], output, r.parityShards, shardSize)
+ return nil
+}
+
+func (r *reedSolomon) updateParityShards(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) {
+ if len(outputs) == 0 {
+ return
+ }
+
+ if r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize {
+ r.updateParityShardsP(matrixRows, oldinputs, newinputs, outputs, outputCount, byteCount)
+ return
+ }
+
+ for c := 0; c < r.dataShards; c++ {
+ in := newinputs[c]
+ if in == nil {
+ continue
+ }
+ oldin := oldinputs[c]
+ // oldinputs data will be changed
+ sliceXor(in, oldin, &r.o)
+ for iRow := 0; iRow < outputCount; iRow++ {
+ galMulSliceXor(matrixRows[iRow][c], oldin, outputs[iRow], &r.o)
+ }
+ }
+}
+
+func (r *reedSolomon) updateParityShardsP(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) {
+ var wg sync.WaitGroup
+ do := byteCount / r.o.maxGoroutines
+ if do < r.o.minSplitSize {
+ do = r.o.minSplitSize
+ }
+ start := 0
+ for start < byteCount {
+ if start+do > byteCount {
+ do = byteCount - start
+ }
+ wg.Add(1)
+ go func(start, stop int) {
+ for c := 0; c < r.dataShards; c++ {
+ in := newinputs[c]
+ if in == nil {
+ continue
+ }
+ oldin := oldinputs[c]
+ // oldinputs data will be changed
+ sliceXor(in[start:stop], oldin[start:stop], &r.o)
+ for iRow := 0; iRow < outputCount; iRow++ {
+ galMulSliceXor(matrixRows[iRow][c], oldin[start:stop], outputs[iRow][start:stop], &r.o)
+ }
+ }
+ wg.Done()
+ }(start, start+do)
+ start += do
+ }
+ wg.Wait()
+}
+
+// Verify returns true if the parity shards contain the right data.
+// The data is the same format as Encode. No data is modified.
+func (r *reedSolomon) Verify(shards [][]byte) (bool, error) {
+ if len(shards) != r.totalShards {
+ return false, ErrTooFewShards
+ }
+ err := checkShards(shards, false)
+ if err != nil {
+ return false, err
+ }
+
+ // Slice of buffers being checked.
+ toCheck := shards[r.dataShards:]
+
+ // Do the checking.
+ return r.checkSomeShards(r.parity, shards[:r.dataShards], toCheck[:r.parityShards], len(shards[0])), nil
+}
+
+func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool {
+ return avx2CodeGen && pshufb && r.o.useAVX2 &&
+ byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards &&
+ inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs
+}
+
+func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) bool {
+ return avx2CodeGen && (r.o.useAvx512GFNI || r.o.useAvxGNFI) &&
+ byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards &&
+ inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs
+}
+
+// codeSomeShards multiplies a subset of rows from a coding matrix by a full set of
+// input shards to produce some output shards.
+// 'matrixRows' is the rows from the matrix to use.
+// 'inputs' is an array of byte arrays, each of which is one input shard.
+// The number of inputs used is determined by the length of each matrix row.
+// 'outputs' is the byte arrays where the computed shards are stored.
+// The number of outputs computed, and the number of matrix rows used,
+// is determined by the number of output slices.
+func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteCount int) {
+ if len(outputs) == 0 {
+ return
+ }
+ if byteCount > r.o.minSplitSize {
+ r.codeSomeShardsP(matrixRows, inputs, outputs, byteCount)
+ return
+ }
+
+ // Process using no goroutines
+ start, end := 0, r.o.perRound
+ if end > len(inputs[0]) {
+ end = len(inputs[0])
+ }
+ if r.canGFNI(byteCount, len(inputs), len(outputs)) {
+ var gfni [maxAvx2Inputs * maxAvx2Outputs]uint64
+ m := genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), gfni[:])
+ if r.o.useAvx512GFNI {
+ start += galMulSlicesGFNI(m, inputs, outputs, 0, byteCount)
+ } else {
+ start += galMulSlicesAvxGFNI(m, inputs, outputs, 0, byteCount)
+ }
+ end = len(inputs[0])
+ } else if r.canAVX2C(byteCount, len(inputs), len(outputs)) {
+ m := genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
+ start += galMulSlicesAvx2(m, inputs, outputs, 0, byteCount)
+ r.putTmpSlice(m)
+ end = len(inputs[0])
+ } else if len(inputs)+len(outputs) > avx2CodeGenMinShards && r.canAVX2C(byteCount, maxAvx2Inputs, maxAvx2Outputs) {
+ var gfni [maxAvx2Inputs * maxAvx2Outputs]uint64
+ end = len(inputs[0])
+ inIdx := 0
+ m := r.getTmpSlice()
+ defer r.putTmpSlice(m)
+ ins := inputs
+ for len(ins) > 0 {
+ inPer := ins
+ if len(inPer) > maxAvx2Inputs {
+ inPer = inPer[:maxAvx2Inputs]
+ }
+ outs := outputs
+ outIdx := 0
+ for len(outs) > 0 {
+ outPer := outs
+ if len(outPer) > maxAvx2Outputs {
+ outPer = outPer[:maxAvx2Outputs]
+ }
+ if r.o.useAvx512GFNI {
+ m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:])
+ if inIdx == 0 {
+ start = galMulSlicesGFNI(m, inPer, outPer, 0, byteCount)
+ } else {
+ start = galMulSlicesGFNIXor(m, inPer, outPer, 0, byteCount)
+ }
+ } else if r.o.useAvxGNFI {
+ m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:])
+ if inIdx == 0 {
+ start = galMulSlicesAvxGFNI(m, inPer, outPer, 0, byteCount)
+ } else {
+ start = galMulSlicesAvxGFNIXor(m, inPer, outPer, 0, byteCount)
+ }
+ } else {
+ m = genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m)
+ if inIdx == 0 {
+ start = galMulSlicesAvx2(m, inPer, outPer, 0, byteCount)
+ } else {
+ start = galMulSlicesAvx2Xor(m, inPer, outPer, 0, byteCount)
+ }
+ }
+ outIdx += len(outPer)
+ outs = outs[len(outPer):]
+ }
+ inIdx += len(inPer)
+ ins = ins[len(inPer):]
+ }
+ if start >= end {
+ return
+ }
+ }
+ for start < len(inputs[0]) {
+ for c := 0; c < len(inputs); c++ {
+ in := inputs[c][start:end]
+ for iRow := 0; iRow < len(outputs); iRow++ {
+ if c == 0 {
+ galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:end], &r.o)
+ } else {
+ galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][start:end], &r.o)
+ }
+ }
+ }
+ start = end
+ end += r.o.perRound
+ if end > len(inputs[0]) {
+ end = len(inputs[0])
+ }
+ }
+}
+
+// Perform the same as codeSomeShards, but split the workload into
+// several goroutines.
+func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byteCount int) {
+ var wg sync.WaitGroup
+ gor := r.o.maxGoroutines
+
+ var avx2Matrix []byte
+ var gfniMatrix []uint64
+ useAvx2 := r.canAVX2C(byteCount, len(inputs), len(outputs))
+ useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs))
+ if useGFNI {
+ var tmp [maxAvx2Inputs * maxAvx2Outputs]uint64
+ gfniMatrix = genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), tmp[:])
+ } else if useAvx2 {
+ avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
+ defer r.putTmpSlice(avx2Matrix)
+ } else if (r.o.useAvx512GFNI || r.o.useAvxGNFI) && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
+ r.canGFNI(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
+ // It appears there is a switchover point at around 10MB where
+ // regular processing is faster...
+ r.codeSomeShardsGFNI(matrixRows, inputs, outputs, byteCount, true)
+ return
+ } else if r.o.useAVX2 && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
+ r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
+ // It appears there is a switchover point at around 10MB where
+ // regular processing is faster...
+ r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount, true)
+ return
+ }
+
+ do := byteCount / gor
+ if do < r.o.minSplitSize {
+ do = r.o.minSplitSize
+ }
+
+ exec := func(start, stop int) {
+ if stop-start >= 64 {
+ if useGFNI {
+ if r.o.useAvx512GFNI {
+ start += galMulSlicesGFNI(gfniMatrix, inputs, outputs, start, stop)
+ } else {
+ start += galMulSlicesAvxGFNI(gfniMatrix, inputs, outputs, start, stop)
+ }
+ } else if useAvx2 {
+ start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop)
+ }
+ }
+
+ lstart, lstop := start, start+r.o.perRound
+ if lstop > stop {
+ lstop = stop
+ }
+ for lstart < stop {
+ for c := 0; c < len(inputs); c++ {
+ in := inputs[c][lstart:lstop]
+ for iRow := 0; iRow < len(outputs); iRow++ {
+ if c == 0 {
+ galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
+ } else {
+ galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
+ }
+ }
+ }
+ lstart = lstop
+ lstop += r.o.perRound
+ if lstop > stop {
+ lstop = stop
+ }
+ }
+ wg.Done()
+ }
+ if gor <= 1 {
+ wg.Add(1)
+ exec(0, byteCount)
+ return
+ }
+
+ // Make sizes divisible by 64
+ do = (do + 63) & (^63)
+ start := 0
+ for start < byteCount {
+ if start+do > byteCount {
+ do = byteCount - start
+ }
+
+ wg.Add(1)
+ go exec(start, start+do)
+ start += do
+ }
+ wg.Wait()
+}
+
+// Perform the same as codeSomeShards, but split the workload into
+// several goroutines.
+// If clear is set, the first write will overwrite the output.
+func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) {
+ var wg sync.WaitGroup
+ gor := r.o.maxGoroutines
+
+ type state struct {
+ input [][]byte
+ output [][]byte
+ m []byte
+ first bool
+ }
+ // Make a plan...
+ plan := make([]state, 0, ((len(inputs)+maxAvx2Inputs-1)/maxAvx2Inputs)*((len(outputs)+maxAvx2Outputs-1)/maxAvx2Outputs))
+
+ tmp := r.getTmpSlice()
+ defer r.putTmpSlice(tmp)
+
+ // Flip between input-first and output-first ordering.
+ // We put the smallest data load in the inner loop.
+ if len(inputs) > len(outputs) {
+ inIdx := 0
+ ins := inputs
+ for len(ins) > 0 {
+ inPer := ins
+ if len(inPer) > maxAvx2Inputs {
+ inPer = inPer[:maxAvx2Inputs]
+ }
+ outs := outputs
+ outIdx := 0
+ for len(outs) > 0 {
+ outPer := outs
+ if len(outPer) > maxAvx2Outputs {
+ outPer = outPer[:maxAvx2Outputs]
+ }
+ // Generate local matrix
+ m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp)
+ tmp = tmp[len(m):]
+ plan = append(plan, state{
+ input: inPer,
+ output: outPer,
+ m: m,
+ first: inIdx == 0 && clear,
+ })
+ outIdx += len(outPer)
+ outs = outs[len(outPer):]
+ }
+ inIdx += len(inPer)
+ ins = ins[len(inPer):]
+ }
+ } else {
+ outs := outputs
+ outIdx := 0
+ for len(outs) > 0 {
+ outPer := outs
+ if len(outPer) > maxAvx2Outputs {
+ outPer = outPer[:maxAvx2Outputs]
+ }
+
+ inIdx := 0
+ ins := inputs
+ for len(ins) > 0 {
+ inPer := ins
+ if len(inPer) > maxAvx2Inputs {
+ inPer = inPer[:maxAvx2Inputs]
+ }
+ // Generate local matrix
+ m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp)
+ tmp = tmp[len(m):]
+ //fmt.Println("bytes:", len(inPer)*r.o.perRound, "out:", len(outPer)*r.o.perRound)
+ plan = append(plan, state{
+ input: inPer,
+ output: outPer,
+ m: m,
+ first: inIdx == 0 && clear,
+ })
+ inIdx += len(inPer)
+ ins = ins[len(inPer):]
+ }
+ outIdx += len(outPer)
+ outs = outs[len(outPer):]
+ }
+ }
+
+ do := byteCount / gor
+ if do < r.o.minSplitSize {
+ do = r.o.minSplitSize
+ }
+
+ exec := func(start, stop int) {
+ defer wg.Done()
+ lstart, lstop := start, start+r.o.perRound
+ if lstop > stop {
+ lstop = stop
+ }
+ for lstart < stop {
+ if lstop-lstart >= minAvx2Size {
+ // Execute plan...
+ var n int
+ for _, p := range plan {
+ if p.first {
+ n = galMulSlicesAvx2(p.m, p.input, p.output, lstart, lstop)
+ } else {
+ n = galMulSlicesAvx2Xor(p.m, p.input, p.output, lstart, lstop)
+ }
+ }
+ lstart += n
+ if lstart == lstop {
+ lstop += r.o.perRound
+ if lstop > stop {
+ lstop = stop
+ }
+ continue
+ }
+ }
+
+ for c := range inputs {
+ in := inputs[c][lstart:lstop]
+ for iRow := 0; iRow < len(outputs); iRow++ {
+ if c == 0 && clear {
+ galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
+ } else {
+ galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
+ }
+ }
+ }
+ lstart = lstop
+ lstop += r.o.perRound
+ if lstop > stop {
+ lstop = stop
+ }
+ }
+ }
+ if gor == 1 {
+ wg.Add(1)
+ exec(0, byteCount)
+ return
+ }
+
+ // Make sizes divisible by 64
+ do = (do + 63) & (^63)
+ start := 0
+ for start < byteCount {
+ if start+do > byteCount {
+ do = byteCount - start
+ }
+
+ wg.Add(1)
+ go exec(start, start+do)
+ start += do
+ }
+ wg.Wait()
+}
+
+// Perform the same as codeSomeShards, but split the workload into
+// several goroutines.
+// If clear is set, the first write will overwrite the output.
+func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) {
+ var wg sync.WaitGroup
+ gor := r.o.maxGoroutines
+
+ type state struct {
+ input [][]byte
+ output [][]byte
+ m []uint64
+ first bool
+ }
+ // Make a plan...
+ plan := make([]state, 0, ((len(inputs)+maxAvx2Inputs-1)/maxAvx2Inputs)*((len(outputs)+maxAvx2Outputs-1)/maxAvx2Outputs))
+
+ // Flip between input-first and output-first ordering.
+ // We put the smallest data load in the inner loop.
+ if len(inputs) > len(outputs) {
+ inIdx := 0
+ ins := inputs
+ for len(ins) > 0 {
+ inPer := ins
+ if len(inPer) > maxAvx2Inputs {
+ inPer = inPer[:maxAvx2Inputs]
+ }
+ outs := outputs
+ outIdx := 0
+ for len(outs) > 0 {
+ outPer := outs
+ if len(outPer) > maxAvx2Outputs {
+ outPer = outPer[:maxAvx2Outputs]
+ }
+ // Generate local matrix
+ m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), make([]uint64, len(inPer)*len(outPer)))
+ plan = append(plan, state{
+ input: inPer,
+ output: outPer,
+ m: m,
+ first: inIdx == 0 && clear,
+ })
+ outIdx += len(outPer)
+ outs = outs[len(outPer):]
+ }
+ inIdx += len(inPer)
+ ins = ins[len(inPer):]
+ }
+ } else {
+ outs := outputs
+ outIdx := 0
+ for len(outs) > 0 {
+ outPer := outs
+ if len(outPer) > maxAvx2Outputs {
+ outPer = outPer[:maxAvx2Outputs]
+ }
+
+ inIdx := 0
+ ins := inputs
+ for len(ins) > 0 {
+ inPer := ins
+ if len(inPer) > maxAvx2Inputs {
+ inPer = inPer[:maxAvx2Inputs]
+ }
+ // Generate local matrix
+ m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), make([]uint64, len(inPer)*len(outPer)))
+ //fmt.Println("bytes:", len(inPer)*r.o.perRound, "out:", len(outPer)*r.o.perRound)
+ plan = append(plan, state{
+ input: inPer,
+ output: outPer,
+ m: m,
+ first: inIdx == 0 && clear,
+ })
+ inIdx += len(inPer)
+ ins = ins[len(inPer):]
+ }
+ outIdx += len(outPer)
+ outs = outs[len(outPer):]
+ }
+ }
+
+ do := byteCount / gor
+ if do < r.o.minSplitSize {
+ do = r.o.minSplitSize
+ }
+
+ exec := func(start, stop int) {
+ defer wg.Done()
+ lstart, lstop := start, start+r.o.perRound
+ if lstop > stop {
+ lstop = stop
+ }
+ for lstart < stop {
+ if lstop-lstart >= minAvx2Size {
+ // Execute plan...
+ var n int
+ if r.o.useAvx512GFNI {
+ for _, p := range plan {
+ if p.first {
+ n = galMulSlicesGFNI(p.m, p.input, p.output, lstart, lstop)
+ } else {
+ n = galMulSlicesGFNIXor(p.m, p.input, p.output, lstart, lstop)
+ }
+ }
+ } else {
+ for _, p := range plan {
+ if p.first {
+ n = galMulSlicesAvxGFNI(p.m, p.input, p.output, lstart, lstop)
+ } else {
+ n = galMulSlicesAvxGFNIXor(p.m, p.input, p.output, lstart, lstop)
+ }
+ }
+ }
+ lstart += n
+ if lstart == lstop {
+ lstop += r.o.perRound
+ if lstop > stop {
+ lstop = stop
+ }
+ continue
+ }
+ }
+
+ for c := range inputs {
+ in := inputs[c][lstart:lstop]
+ for iRow := 0; iRow < len(outputs); iRow++ {
+ if c == 0 && clear {
+ galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
+ } else {
+ galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
+ }
+ }
+ }
+ lstart = lstop
+ lstop += r.o.perRound
+ if lstop > stop {
+ lstop = stop
+ }
+ }
+ }
+
+ if gor == 1 {
+ wg.Add(1)
+ exec(0, byteCount)
+ return
+ }
+
+ // Make sizes divisible by 64
+ do = (do + 63) & (^63)
+ start := 0
+ for start < byteCount {
+ if start+do > byteCount {
+ do = byteCount - start
+ }
+
+ wg.Add(1)
+ go exec(start, start+do)
+ start += do
+ }
+ wg.Wait()
+}
+
+// checkSomeShards is mostly the same as codeSomeShards,
+// except this will check values and return
+// as soon as a difference is found.
+func (r *reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, byteCount int) bool {
+ if len(toCheck) == 0 {
+ return true
+ }
+
+ outputs := AllocAligned(len(toCheck), byteCount)
+ r.codeSomeShards(matrixRows, inputs, outputs, byteCount)
+
+ for i, calc := range outputs {
+ if !bytes.Equal(calc, toCheck[i]) {
+ return false
+ }
+ }
+ return true
+}
+
+// ErrShardNoData will be returned if there are no shards,
+// or if the length of all shards is zero.
+var ErrShardNoData = errors.New("no shard data")
+
+// ErrShardSize is returned if shard length isn't the same for all
+// shards.
+var ErrShardSize = errors.New("shard sizes do not match")
+
+// ErrInvalidShardSize is returned if shard length doesn't meet the requirements,
+// typically a multiple of N.
+var ErrInvalidShardSize = errors.New("invalid shard size")
+
+// checkShards will check if shards are the same size
+// or 0, if allowed. An error is returned if this fails.
+// An error is also returned if all shards are size 0.
+func checkShards(shards [][]byte, nilok bool) error {
+ size := shardSize(shards)
+ if size == 0 {
+ return ErrShardNoData
+ }
+ for _, shard := range shards {
+ if len(shard) != size {
+ if len(shard) != 0 || !nilok {
+ return ErrShardSize
+ }
+ }
+ }
+ return nil
+}
+
+// shardSize returns the size of a single shard.
+// The first non-zero size is returned,
+// or 0 if all shards are size 0.
+func shardSize(shards [][]byte) int {
+ for _, shard := range shards {
+ if len(shard) != 0 {
+ return len(shard)
+ }
+ }
+ return 0
+}
+
+// Reconstruct will recreate the missing shards, if possible.
+//
+// Given a list of shards, some of which contain data, fills in the
+// ones that don't have data.
+//
+// The length of the array must be equal to the total number of shards.
+// You indicate that a shard is missing by setting it to nil or zero-length.
+// If a shard is zero-length but has sufficient capacity, that memory will
+// be used, otherwise a new []byte will be allocated.
+//
+// If there are too few shards to reconstruct the missing
+// ones, ErrTooFewShards will be returned.
+//
+// The reconstructed shard set is complete, but integrity is not verified.
+// Use the Verify function to check if data set is ok.
+func (r *reedSolomon) Reconstruct(shards [][]byte) error {
+ return r.reconstruct(shards, false, nil)
+}
+
+// ReconstructData will recreate any missing data shards, if possible.
+//
+// Given a list of shards, some of which contain data, fills in the
+// data shards that don't have data.
+//
+// The length of the array must be equal to the total number of shards.
+// You indicate that a shard is missing by setting it to nil or zero-length.
+// If a shard is zero-length but has sufficient capacity, that memory will
+// be used, otherwise a new []byte will be allocated.
+//
+// If there are too few shards to reconstruct the missing
+// ones, ErrTooFewShards will be returned.
+//
+// As the reconstructed shard set may contain missing parity shards,
+// calling the Verify function is likely to fail.
+func (r *reedSolomon) ReconstructData(shards [][]byte) error {
+ return r.reconstruct(shards, true, nil)
+}
+
+// ReconstructSome will recreate only requested shards, if possible.
+//
+// Given a list of shards, some of which contain data, fills in the
+// shards indicated by true values in the "required" parameter.
+// The length of the "required" array must be equal to either Shards or DataShards.
+// If the length is equal to DataShards, the reconstruction of parity shards will be ignored.
+//
+// The length of "shards" array must be equal to Shards.
+// You indicate that a shard is missing by setting it to nil or zero-length.
+// If a shard is zero-length but has sufficient capacity, that memory will
+// be used, otherwise a new []byte will be allocated.
+//
+// If there are too few shards to reconstruct the missing
+// ones, ErrTooFewShards will be returned.
+//
+// As the reconstructed shard set may contain missing parity shards,
+// calling the Verify function is likely to fail.
+func (r *reedSolomon) ReconstructSome(shards [][]byte, required []bool) error {
+ if len(required) == r.totalShards {
+ return r.reconstruct(shards, false, required)
+ }
+ return r.reconstruct(shards, true, required)
+}
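+
+// A minimal usage sketch of the reconstruction API above (editor's
+// illustration, not part of the upstream source). It assumes the
+// package-level New() constructor used elsewhere in this diff and a
+// 4+2 shard layout; data stands in for the caller's payload:
+//
+//	enc, _ := New(4, 2)
+//	shards, _ := enc.Split(data)
+//	_ = enc.Encode(shards)
+//	shards[0] = nil // a lost data shard
+//	shards[5] = nil // a lost parity shard
+//	if err := enc.Reconstruct(shards); err != nil {
+//		// fewer than 4 shards survived; nothing can be rebuilt
+//	}
+//	ok, _ := enc.Verify(shards) // Reconstruct does not verify integrity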
+
+// reconstruct will recreate the missing data shards, and unless
+// dataOnly is true, also the missing parity shards.
+//
+// The length of the "shards" array must be equal to totalShards.
+// You indicate that a shard is missing by setting it to nil.
+//
+// If there are too few shards to reconstruct the missing
+// ones, ErrTooFewShards will be returned.
+func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool, required []bool) error {
+ if len(shards) != r.totalShards || required != nil && len(required) < r.dataShards {
+ return ErrTooFewShards
+ }
+ // Check arguments.
+ err := checkShards(shards, true)
+ if err != nil {
+ return err
+ }
+
+ shardSize := shardSize(shards)
+
+ // Quick check: are all of the shards present? If so, there's
+ // nothing to do.
+ numberPresent := 0
+ dataPresent := 0
+ missingRequired := 0
+ for i := 0; i < r.totalShards; i++ {
+ if len(shards[i]) != 0 {
+ numberPresent++
+ if i < r.dataShards {
+ dataPresent++
+ }
+ } else if required != nil && required[i] {
+ missingRequired++
+ }
+ }
+ if numberPresent == r.totalShards || dataOnly && dataPresent == r.dataShards ||
+ required != nil && missingRequired == 0 {
+ // Cool. All of the shards have data. We don't
+ // need to do anything.
+ return nil
+ }
+
+ // More complete sanity check
+ if numberPresent < r.dataShards {
+ return ErrTooFewShards
+ }
+
+ // Pull out an array holding just the shards that
+ // correspond to the rows of the submatrix. These shards
+ // will be the input to the decoding process that re-creates
+ // the missing data shards.
+ //
+ // Also, create an array of indices of the valid rows we do have
+ // and the invalid rows we don't have up until we have enough valid rows.
+ subShards := make([][]byte, r.dataShards)
+ validIndices := make([]int, r.dataShards)
+ invalidIndices := make([]int, 0)
+ subMatrixRow := 0
+ for matrixRow := 0; matrixRow < r.totalShards && subMatrixRow < r.dataShards; matrixRow++ {
+ if len(shards[matrixRow]) != 0 {
+ subShards[subMatrixRow] = shards[matrixRow]
+ validIndices[subMatrixRow] = matrixRow
+ subMatrixRow++
+ } else {
+ invalidIndices = append(invalidIndices, matrixRow)
+ }
+ }
+
+ // Attempt to get the cached inverted matrix out of the tree
+ // based on the indices of the invalid rows.
+ dataDecodeMatrix := r.tree.GetInvertedMatrix(invalidIndices)
+
+ // If the inverted matrix isn't cached in the tree yet we must
+ // construct it ourselves and insert it into the tree for the
+ // future. In this way the inversion tree is lazily loaded.
+ if dataDecodeMatrix == nil {
+ // Pull out the rows of the matrix that correspond to the
+ // shards that we have and build a square matrix. This
+ // matrix could be used to generate the shards that we have
+ // from the original data.
+ subMatrix, _ := newMatrix(r.dataShards, r.dataShards)
+ for subMatrixRow, validIndex := range validIndices {
+ for c := 0; c < r.dataShards; c++ {
+ subMatrix[subMatrixRow][c] = r.m[validIndex][c]
+ }
+ }
+ // Invert the matrix, so we can go from the encoded shards
+ // back to the original data. Then pull out the row that
+ // generates the shard that we want to decode. Note that
+ // since this matrix maps back to the original data, it can
+ // be used to create a data shard, but not a parity shard.
+ dataDecodeMatrix, err = subMatrix.Invert()
+ if err != nil {
+ return err
+ }
+
+ // Cache the inverted matrix in the tree for future use keyed on the
+ // indices of the invalid rows.
+ err = r.tree.InsertInvertedMatrix(invalidIndices, dataDecodeMatrix, r.totalShards)
+ if err != nil {
+ return err
+ }
+ }
+
+ // Re-create any data shards that were missing.
+ //
+ // The input to the coding is all of the shards we actually
+ // have, and the output is the missing data shards. The computation
+ // is done using the special decode matrix we just built.
+ outputs := make([][]byte, r.parityShards)
+ matrixRows := make([][]byte, r.parityShards)
+ outputCount := 0
+
+ for iShard := 0; iShard < r.dataShards; iShard++ {
+ if len(shards[iShard]) == 0 && (required == nil || required[iShard]) {
+ if cap(shards[iShard]) >= shardSize {
+ shards[iShard] = shards[iShard][0:shardSize]
+ } else {
+ shards[iShard] = AllocAligned(1, shardSize)[0]
+ }
+ outputs[outputCount] = shards[iShard]
+ matrixRows[outputCount] = dataDecodeMatrix[iShard]
+ outputCount++
+ }
+ }
+ r.codeSomeShards(matrixRows, subShards, outputs[:outputCount], shardSize)
+
+ if dataOnly {
+ // Exit out early if we are only interested in the data shards
+ return nil
+ }
+
+ // Now that we have all of the data shards intact, we can
+ // compute any of the parity that is missing.
+ //
+ // The input to the coding is ALL of the data shards, including
+ // any that we just calculated. The output is whichever of the
+	// parity shards were missing.
+ outputCount = 0
+ for iShard := r.dataShards; iShard < r.totalShards; iShard++ {
+ if len(shards[iShard]) == 0 && (required == nil || required[iShard]) {
+ if cap(shards[iShard]) >= shardSize {
+ shards[iShard] = shards[iShard][0:shardSize]
+ } else {
+ shards[iShard] = AllocAligned(1, shardSize)[0]
+ }
+ outputs[outputCount] = shards[iShard]
+ matrixRows[outputCount] = r.parity[iShard-r.dataShards]
+ outputCount++
+ }
+ }
+ r.codeSomeShards(matrixRows, shards[:r.dataShards], outputs[:outputCount], shardSize)
+ return nil
+}
+
+// ErrShortData will be returned by Split(), if there isn't enough data
+// to fill the number of shards.
+var ErrShortData = errors.New("not enough data to fill the number of requested shards")
+
+// Split a data slice into the number of shards given to the encoder,
+// and create empty parity shards if necessary.
+//
+// The data will be split into equally sized shards.
+// If the data size isn't divisible by the number of shards,
+// the last shard will contain extra zeros.
+//
+// If there is extra capacity on the provided data slice
+// it will be used instead of allocating parity shards.
+// It will be zeroed out.
+//
+// There must be at least 1 byte otherwise ErrShortData will be
+// returned.
+//
+// The data will not be copied, except for the last shard, so you
+// should not modify the data of the input slice afterwards.
+func (r *reedSolomon) Split(data []byte) ([][]byte, error) {
+ if len(data) == 0 {
+ return nil, ErrShortData
+ }
+ if r.totalShards == 1 {
+ return [][]byte{data}, nil
+ }
+
+ dataLen := len(data)
+ // Calculate number of bytes per data shard.
+ perShard := (len(data) + r.dataShards - 1) / r.dataShards
+ needTotal := r.totalShards * perShard
+
+ if cap(data) > len(data) {
+ if cap(data) > needTotal {
+ data = data[:needTotal]
+ } else {
+ data = data[:cap(data)]
+ }
+ clear := data[dataLen:]
+ for i := range clear {
+ clear[i] = 0
+ }
+ }
+
+ // Only allocate memory if necessary
+ var padding [][]byte
+ if len(data) < needTotal {
+ // calculate maximum number of full shards in `data` slice
+ fullShards := len(data) / perShard
+ padding = AllocAligned(r.totalShards-fullShards, perShard)
+
+ if dataLen > perShard*fullShards {
+ // Copy partial shards
+ copyFrom := data[perShard*fullShards : dataLen]
+ for i := range padding {
+ if len(copyFrom) == 0 {
+ break
+ }
+ copyFrom = copyFrom[copy(padding[i], copyFrom):]
+ }
+ }
+ }
+
+ // Split into equal-length shards.
+ dst := make([][]byte, r.totalShards)
+ i := 0
+ for ; i < len(dst) && len(data) >= perShard; i++ {
+ dst[i] = data[:perShard:perShard]
+ data = data[perShard:]
+ }
+
+ for j := 0; i+j < len(dst); j++ {
+ dst[i+j] = padding[0]
+ padding = padding[1:]
+ }
+
+ return dst, nil
+}
+
+// ErrReconstructRequired is returned if too few data shards are intact and a
+// reconstruction is required before you can successfully join the shards.
+var ErrReconstructRequired = errors.New("reconstruction required as one or more required data shards are nil")
+
+// Join the shards and write the data segment to dst.
+//
+// Only the data shards are considered.
+// You must supply the exact output size you want.
+//
+// If there are too few shards given, ErrTooFewShards will be returned.
+// If the total data size is less than outSize, ErrShortData will be returned.
+// If one or more required data shards are nil, ErrReconstructRequired will be returned.
+func (r *reedSolomon) Join(dst io.Writer, shards [][]byte, outSize int) error {
+ // Do we have enough shards?
+ if len(shards) < r.dataShards {
+ return ErrTooFewShards
+ }
+ shards = shards[:r.dataShards]
+
+ // Do we have enough data?
+ size := 0
+ for _, shard := range shards {
+ if shard == nil {
+ return ErrReconstructRequired
+ }
+ size += len(shard)
+
+ // Do we have enough data already?
+ if size >= outSize {
+ break
+ }
+ }
+ if size < outSize {
+ return ErrShortData
+ }
+
+ // Copy data to dst
+ write := outSize
+ for _, shard := range shards {
+ if write < len(shard) {
+ _, err := dst.Write(shard[:write])
+ return err
+ }
+ n, err := dst.Write(shard)
+ if err != nil {
+ return err
+ }
+ write -= n
+ }
+ return nil
+}
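+
+// A small round-trip sketch for Split/Join (editor's illustration, not
+// part of the upstream source). Join must be told the original length so
+// the zero padding added by Split is stripped again; data is the
+// caller's payload:
+//
+//	enc, _ := New(4, 2)
+//	shards, _ := enc.Split(data)
+//	_ = enc.Encode(shards)
+//	var buf bytes.Buffer
+//	if err := enc.Join(&buf, shards, len(data)); err == nil {
+//		// buf.Bytes() equals the original data
+//	}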
diff --git a/vendor/github.com/klauspost/reedsolomon/streaming.go b/vendor/github.com/klauspost/reedsolomon/streaming.go
new file mode 100644
index 000000000..f7aba3b89
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/streaming.go
@@ -0,0 +1,614 @@
+/**
+ * Reed-Solomon Coding over 8-bit values.
+ *
+ * Copyright 2015, Klaus Post
+ * Copyright 2015, Backblaze, Inc.
+ */
+
+package reedsolomon
+
+import (
+ "errors"
+ "fmt"
+ "io"
+ "sync"
+)
+
+// StreamEncoder is an interface to encode Reed-Solomon parity sets for your data.
+// It provides a fully streaming interface, and processes data in blocks of up to 4MB.
+//
+// For small shard sizes, 10MB and below, it is recommended to use the in-memory interface,
+// since the streaming interface has a start up overhead.
+//
+// For all operations, readers and writers should not assume any order/size of
+// individual reads/writes.
+//
+// For usage examples, see "stream-encoder.go" and "stream-decoder.go" in the examples
+// folder.
+type StreamEncoder interface {
+ // Encode parity shards for a set of data shards.
+ //
+	// Input is 'data' containing readers for the data shards, and 'parity'
+	// containing writers for the parity shards.
+ //
+ // The number of shards must match the number given to NewStream().
+ //
+ // Each reader must supply the same number of bytes.
+ //
+ // The parity shards will be written to the writer.
+ // The number of bytes written will match the input size.
+ //
+ // If a data stream returns an error, a StreamReadError type error
+ // will be returned. If a parity writer returns an error, a
+ // StreamWriteError will be returned.
+ Encode(data []io.Reader, parity []io.Writer) error
+
+ // Verify returns true if the parity shards contain correct data.
+ //
+	// The number of shards must match the total number of data+parity shards
+ // given to NewStream().
+ //
+ // Each reader must supply the same number of bytes.
+ // If a shard stream returns an error, a StreamReadError type error
+ // will be returned.
+ Verify(shards []io.Reader) (bool, error)
+
+ // Reconstruct will recreate the missing shards if possible.
+ //
+ // Given a list of valid shards (to read) and invalid shards (to write)
+ //
+ // You indicate that a shard is missing by setting it to nil in the 'valid'
+ // slice and at the same time setting a non-nil writer in "fill".
+	// An index cannot contain both a non-nil 'valid' and a non-nil 'fill' entry.
+ // If both are provided 'ErrReconstructMismatch' is returned.
+ //
+ // If there are too few shards to reconstruct the missing
+ // ones, ErrTooFewShards will be returned.
+ //
+ // The reconstructed shard set is complete, but integrity is not verified.
+	// Use the Verify function to check if the data set is ok.
+ Reconstruct(valid []io.Reader, fill []io.Writer) error
+
+	// Split an input stream into the number of shards given to the encoder.
+ //
+ // The data will be split into equally sized shards.
+	// If the data size isn't divisible by the number of shards,
+ // the last shard will contain extra zeros.
+ //
+ // You must supply the total size of your input.
+ // 'ErrShortData' will be returned if it is unable to retrieve the
+ // number of bytes indicated.
+ Split(data io.Reader, dst []io.Writer, size int64) (err error)
+
+ // Join the shards and write the data segment to dst.
+ //
+ // Only the data shards are considered.
+ //
+ // You must supply the exact output size you want.
+	// If there are too few shards given, ErrTooFewShards will be returned.
+ // If the total data size is less than outSize, ErrShortData will be returned.
+ Join(dst io.Writer, shards []io.Reader, outSize int64) error
+}
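+
+// A minimal sketch of driving this interface (editor's illustration, not
+// part of the upstream source), assuming the caller has already opened
+// one reader per data shard and one writer per parity shard:
+//
+//	enc, _ := NewStream(4, 2)
+//	data := make([]io.Reader, 4)
+//	parity := make([]io.Writer, 2)
+//	// ... assign the opened readers/writers ...
+//	if err := enc.Encode(data, parity); err != nil {
+//		// StreamReadError / StreamWriteError identify the failing stream
+//	}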
+
+// StreamReadError is returned when a read error is encountered
+// that relates to a supplied stream.
+// This will allow you to find out which reader has failed.
+type StreamReadError struct {
+ Err error // The error
+ Stream int // The stream number on which the error occurred
+}
+
+// Error returns the error as a string
+func (s StreamReadError) Error() string {
+ return fmt.Sprintf("error reading stream %d: %s", s.Stream, s.Err)
+}
+
+// String returns the error as a string
+func (s StreamReadError) String() string {
+ return s.Error()
+}
+
+// StreamWriteError is returned when a write error is encountered
+// that relates to a supplied stream. This will allow you to
+// find out which writer has failed.
+type StreamWriteError struct {
+ Err error // The error
+ Stream int // The stream number on which the error occurred
+}
+
+// Error returns the error as a string
+func (s StreamWriteError) Error() string {
+ return fmt.Sprintf("error writing stream %d: %s", s.Stream, s.Err)
+}
+
+// String returns the error as a string
+func (s StreamWriteError) String() string {
+ return s.Error()
+}
+
+// rsStream contains a matrix for a specific
+// distribution of data shards and parity shards.
+// Construct it using NewStream().
+type rsStream struct {
+ r *reedSolomon
+ o options
+
+ // Shard reader
+ readShards func(dst [][]byte, in []io.Reader) error
+ // Shard writer
+ writeShards func(out []io.Writer, in [][]byte) error
+
+ blockPool sync.Pool
+}
+
+// NewStream creates a new encoder and initializes it to
+// the number of data shards and parity shards that
+// you want to use. You can reuse this encoder.
+// Note that the maximum total number of shards is 256.
+func NewStream(dataShards, parityShards int, o ...Option) (StreamEncoder, error) {
+ if dataShards+parityShards > 256 {
+ return nil, ErrMaxShardNum
+ }
+
+ r := rsStream{o: defaultOptions}
+ for _, opt := range o {
+ opt(&r.o)
+ }
+ // Override block size if shard size is set.
+ if r.o.streamBS == 0 && r.o.shardSize > 0 {
+ r.o.streamBS = r.o.shardSize
+ }
+ if r.o.streamBS <= 0 {
+ r.o.streamBS = 4 << 20
+ }
+ if r.o.shardSize == 0 && r.o.maxGoroutines == defaultOptions.maxGoroutines {
+ o = append(o, WithAutoGoroutines(r.o.streamBS))
+ }
+
+ enc, err := New(dataShards, parityShards, o...)
+ if err != nil {
+ return nil, err
+ }
+ r.r = enc.(*reedSolomon)
+
+ r.blockPool.New = func() interface{} {
+ return AllocAligned(dataShards+parityShards, r.o.streamBS)
+ }
+ r.readShards = readShards
+ r.writeShards = writeShards
+ if r.o.concReads {
+ r.readShards = cReadShards
+ }
+ if r.o.concWrites {
+ r.writeShards = cWriteShards
+ }
+
+ return &r, err
+}
+
+// NewStreamC creates a new encoder and initializes it to
+// the number of data shards and parity shards given.
+//
+// This functions as 'NewStream', but allows you to enable CONCURRENT reads and writes.
+func NewStreamC(dataShards, parityShards int, conReads, conWrites bool, o ...Option) (StreamEncoder, error) {
+ return NewStream(dataShards, parityShards, append(o, WithConcurrentStreamReads(conReads), WithConcurrentStreamWrites(conWrites))...)
+}
+
+func (r *rsStream) createSlice() [][]byte {
+ out := r.blockPool.Get().([][]byte)
+ for i := range out {
+ out[i] = out[i][:r.o.streamBS]
+ }
+ return out
+}
+
+// Encodes parity shards for a set of data shards.
+//
+// Input is 'data' containing readers for the data shards, and 'parity'
+// containing writers for the parity shards.
+//
+// The number of shards must match the number given to NewStream().
+//
+// Each reader must supply the same number of bytes.
+//
+// The parity shards will be written to the writer.
+// The number of bytes written will match the input size.
+//
+// If a data stream returns an error, a StreamReadError type error
+// will be returned. If a parity writer returns an error, a
+// StreamWriteError will be returned.
+func (r *rsStream) Encode(data []io.Reader, parity []io.Writer) error {
+ if len(data) != r.r.dataShards {
+ return ErrTooFewShards
+ }
+
+ if len(parity) != r.r.parityShards {
+ return ErrTooFewShards
+ }
+
+ all := r.createSlice()
+ defer r.blockPool.Put(all)
+ in := all[:r.r.dataShards]
+ out := all[r.r.dataShards:]
+ read := 0
+
+ for {
+ err := r.readShards(in, data)
+ switch err {
+ case nil:
+ case io.EOF:
+ if read == 0 {
+ return ErrShardNoData
+ }
+ return nil
+ default:
+ return err
+ }
+ out = trimShards(out, shardSize(in))
+ read += shardSize(in)
+ err = r.r.Encode(all)
+ if err != nil {
+ return err
+ }
+ err = r.writeShards(parity, out)
+ if err != nil {
+ return err
+ }
+ }
+}
+
+// Trim the shards so they are all the same size
+func trimShards(in [][]byte, size int) [][]byte {
+ for i := range in {
+ if len(in[i]) != 0 {
+ in[i] = in[i][0:size]
+ }
+ if len(in[i]) < size {
+ in[i] = in[i][:0]
+ }
+ }
+ return in
+}
+
+func readShards(dst [][]byte, in []io.Reader) error {
+ if len(in) != len(dst) {
+ panic("internal error: in and dst size do not match")
+ }
+ size := -1
+ for i := range in {
+ if in[i] == nil {
+ dst[i] = dst[i][:0]
+ continue
+ }
+ n, err := io.ReadFull(in[i], dst[i])
+ // The error is EOF only if no bytes were read.
+ // If an EOF happens after reading some but not all the bytes,
+ // ReadFull returns ErrUnexpectedEOF.
+ switch err {
+ case io.ErrUnexpectedEOF, io.EOF:
+ if size < 0 {
+ size = n
+ } else if n != size {
+ // Shard sizes must match.
+ return ErrShardSize
+ }
+ dst[i] = dst[i][0:n]
+ case nil:
+ continue
+ default:
+ return StreamReadError{Err: err, Stream: i}
+ }
+ }
+ if size == 0 {
+ return io.EOF
+ }
+ return nil
+}
+
+func writeShards(out []io.Writer, in [][]byte) error {
+ if len(out) != len(in) {
+ panic("internal error: in and out size do not match")
+ }
+ for i := range in {
+ if out[i] == nil {
+ continue
+ }
+ n, err := out[i].Write(in[i])
+ if err != nil {
+ return StreamWriteError{Err: err, Stream: i}
+ }
+	// Make sure the whole shard was written.
+ if n != len(in[i]) {
+ return StreamWriteError{Err: io.ErrShortWrite, Stream: i}
+ }
+ }
+ return nil
+}
+
+type readResult struct {
+ n int
+ size int
+ err error
+}
+
+// cReadShards reads shards concurrently
+func cReadShards(dst [][]byte, in []io.Reader) error {
+ if len(in) != len(dst) {
+ panic("internal error: in and dst size do not match")
+ }
+ var wg sync.WaitGroup
+ wg.Add(len(in))
+ res := make(chan readResult, len(in))
+ for i := range in {
+ if in[i] == nil {
+ dst[i] = dst[i][:0]
+ wg.Done()
+ continue
+ }
+ go func(i int) {
+ defer wg.Done()
+ n, err := io.ReadFull(in[i], dst[i])
+ // The error is EOF only if no bytes were read.
+ // If an EOF happens after reading some but not all the bytes,
+ // ReadFull returns ErrUnexpectedEOF.
+ res <- readResult{size: n, err: err, n: i}
+
+ }(i)
+ }
+ wg.Wait()
+ close(res)
+ size := -1
+ for r := range res {
+ switch r.err {
+ case io.ErrUnexpectedEOF, io.EOF:
+ if size < 0 {
+ size = r.size
+ } else if r.size != size {
+ // Shard sizes must match.
+ return ErrShardSize
+ }
+ dst[r.n] = dst[r.n][0:r.size]
+ case nil:
+ default:
+ return StreamReadError{Err: r.err, Stream: r.n}
+ }
+ }
+ if size == 0 {
+ return io.EOF
+ }
+ return nil
+}
+
+// cWriteShards writes shards concurrently
+func cWriteShards(out []io.Writer, in [][]byte) error {
+ if len(out) != len(in) {
+ panic("internal error: in and out size do not match")
+ }
+ var errs = make(chan error, len(out))
+ var wg sync.WaitGroup
+ wg.Add(len(out))
+ for i := range in {
+ go func(i int) {
+ defer wg.Done()
+ if out[i] == nil {
+ errs <- nil
+ return
+ }
+ n, err := out[i].Write(in[i])
+ if err != nil {
+ errs <- StreamWriteError{Err: err, Stream: i}
+ return
+ }
+ if n != len(in[i]) {
+ errs <- StreamWriteError{Err: io.ErrShortWrite, Stream: i}
+ }
+ }(i)
+ }
+ wg.Wait()
+ close(errs)
+ for err := range errs {
+ if err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
+
+// Verify returns true if the parity shards contain correct data.
+//
+// The number of shards must match the total number of data+parity shards
+// given to NewStream().
+//
+// Each reader must supply the same number of bytes.
+// If a shard stream returns an error, a StreamReadError type error
+// will be returned.
+func (r *rsStream) Verify(shards []io.Reader) (bool, error) {
+ if len(shards) != r.r.totalShards {
+ return false, ErrTooFewShards
+ }
+
+ read := 0
+ all := r.createSlice()
+ defer r.blockPool.Put(all)
+ for {
+ err := r.readShards(all, shards)
+ if err == io.EOF {
+ if read == 0 {
+ return false, ErrShardNoData
+ }
+ return true, nil
+ }
+ if err != nil {
+ return false, err
+ }
+ read += shardSize(all)
+ ok, err := r.r.Verify(all)
+ if !ok || err != nil {
+ return ok, err
+ }
+ }
+}
+
+// ErrReconstructMismatch is returned by the StreamEncoder, if you supply
+// "valid" and "fill" streams on the same index.
+// In that case it is impossible to tell whether you consider the shard valid
+// or would like to have it reconstructed.
+var ErrReconstructMismatch = errors.New("valid shards and fill shards are mutually exclusive")
+
+// Reconstruct will recreate the missing shards if possible.
+//
+// Given a list of valid shards (to read) and invalid shards (to write)
+//
+// You indicate that a shard is missing by setting it to nil in the 'valid'
+// slice and at the same time setting a non-nil writer in "fill".
+// An index cannot contain both a non-nil 'valid' and a non-nil 'fill' entry.
+//
+// If there are too few shards to reconstruct the missing
+// ones, ErrTooFewShards will be returned.
+//
+// The reconstructed shard set is complete when all missing shards are explicitly asked for.
+// However, its integrity is not automatically verified.
+// Use the Verify function to check whether the data set is ok.
+func (r *rsStream) Reconstruct(valid []io.Reader, fill []io.Writer) error {
+ if len(valid) != r.r.totalShards {
+ return ErrTooFewShards
+ }
+ if len(fill) != r.r.totalShards {
+ return ErrTooFewShards
+ }
+
+ all := r.createSlice()
+ defer r.blockPool.Put(all)
+ reconDataOnly := true
+ for i := range valid {
+ if valid[i] != nil && fill[i] != nil {
+ return ErrReconstructMismatch
+ }
+ if i >= r.r.dataShards && fill[i] != nil {
+ reconDataOnly = false
+ }
+ }
+
+ read := 0
+ for {
+ err := r.readShards(all, valid)
+ if err == io.EOF {
+ if read == 0 {
+ return ErrShardNoData
+ }
+ return nil
+ }
+ if err != nil {
+ return err
+ }
+ read += shardSize(all)
+ all = trimShards(all, shardSize(all))
+
+ if reconDataOnly {
+ err = r.r.ReconstructData(all) // just reconstruct missing data shards
+ } else {
+ err = r.r.Reconstruct(all) // reconstruct all missing shards
+ }
+ if err != nil {
+ return err
+ }
+ err = r.writeShards(fill, all)
+ if err != nil {
+ return err
+ }
+ }
+}
+
+// Join the shards and write the data segment to dst.
+//
+// Only the data shards are considered.
+//
+// You must supply the exact output size you want.
+// If there are too few shards given, ErrTooFewShards will be returned.
+// If the total data size is less than outSize, ErrShortData will be returned.
+func (r *rsStream) Join(dst io.Writer, shards []io.Reader, outSize int64) error {
+ // Do we have enough shards?
+ if len(shards) < r.r.dataShards {
+ return ErrTooFewShards
+ }
+
+ // Trim off parity shards if any
+ shards = shards[:r.r.dataShards]
+ for i := range shards {
+ if shards[i] == nil {
+ return StreamReadError{Err: ErrShardNoData, Stream: i}
+ }
+ }
+ // Join all shards
+ src := io.MultiReader(shards...)
+
+ // Copy data to dst
+ n, err := io.CopyN(dst, src, outSize)
+ if err == io.EOF {
+ return ErrShortData
+ }
+ if err != nil {
+ return err
+ }
+ if n != outSize {
+ return ErrShortData
+ }
+ return nil
+}
+
+// Split an input stream into the number of shards given to the encoder.
+//
+// The data will be split into equally sized shards.
+// If the data size isn't divisible by the number of shards,
+// the last shard will contain extra zeros.
+//
+// You must supply the total size of your input.
+// 'ErrShortData' will be returned if it is unable to retrieve the
+// number of bytes indicated.
+func (r *rsStream) Split(data io.Reader, dst []io.Writer, size int64) error {
+ if size == 0 {
+ return ErrShortData
+ }
+ if len(dst) != r.r.dataShards {
+ return ErrInvShardNum
+ }
+
+ for i := range dst {
+ if dst[i] == nil {
+ return StreamWriteError{Err: ErrShardNoData, Stream: i}
+ }
+ }
+
+ // Calculate number of bytes per shard.
+ perShard := (size + int64(r.r.dataShards) - 1) / int64(r.r.dataShards)
+
+ // Pad data to r.Shards*perShard.
+ paddingSize := (int64(r.r.totalShards) * perShard) - size
+ data = io.MultiReader(data, io.LimitReader(zeroPaddingReader{}, paddingSize))
+
+ // Split into equal-length shards and copy.
+ for i := range dst {
+ n, err := io.CopyN(dst[i], data, perShard)
+ if err != io.EOF && err != nil {
+ return err
+ }
+ if n != perShard {
+ return ErrShortData
+ }
+ }
+
+ return nil
+}
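+
+// A short sketch of calling Split (editor's illustration, not part of the
+// upstream source): the caller provides one writer per data shard and the
+// exact input size so the padding can be computed. Here src and srcSize
+// stand in for the caller's input stream and its length:
+//
+//	enc, _ := NewStream(4, 2)
+//	dst := make([]io.Writer, 4)
+//	// ... assign the opened shard writers ...
+//	err := enc.Split(src, dst, srcSize) // ErrShortData if src ends early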
+
+type zeroPaddingReader struct{}
+
+var _ io.Reader = &zeroPaddingReader{}
+
+func (t zeroPaddingReader) Read(p []byte) (n int, err error) {
+ n = len(p)
+ for i := 0; i < n; i++ {
+ p[i] = 0
+ }
+ return n, nil
+}
diff --git a/vendor/github.com/klauspost/reedsolomon/unsafe.go b/vendor/github.com/klauspost/reedsolomon/unsafe.go
new file mode 100644
index 000000000..d85892f0f
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/unsafe.go
@@ -0,0 +1,41 @@
+//go:build !noasm && !nounsafe && !gccgo && !appengine
+
+/**
+ * Reed-Solomon Coding over 8-bit values.
+ *
+ * Copyright 2023, Klaus Post
+ */
+
+package reedsolomon
+
+import (
+ "unsafe"
+)
+
+// AllocAligned allocates 'shards' slices, with 'each' bytes.
+// Each slice will start on a 64 byte aligned boundary.
+func AllocAligned(shards, each int) [][]byte {
+ if false {
+ res := make([][]byte, shards)
+ for i := range res {
+ res[i] = make([]byte, each)
+ }
+ return res
+ }
+ const (
+ alignEach = 64
+ alignStart = 64
+ )
+ eachAligned := ((each + alignEach - 1) / alignEach) * alignEach
+ total := make([]byte, eachAligned*shards+63)
+ align := uint(uintptr(unsafe.Pointer(&total[0]))) & (alignStart - 1)
+ if align > 0 {
+ total = total[alignStart-align:]
+ }
+ res := make([][]byte, shards)
+ for i := range res {
+ res[i] = total[:each:eachAligned]
+ total = total[eachAligned:]
+ }
+ return res
+}
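+
+// A quick way to see the effect (editor's illustration, not part of the
+// upstream source): every returned slice starts on a 64-byte boundary,
+// which can be checked via its address:
+//
+//	shards := AllocAligned(3, 1000)
+//	for _, s := range shards {
+//		if uintptr(unsafe.Pointer(&s[0]))%64 != 0 {
+//			panic("not 64-byte aligned")
+//		}
+//	}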
diff --git a/vendor/github.com/klauspost/reedsolomon/unsafe_disabled.go b/vendor/github.com/klauspost/reedsolomon/unsafe_disabled.go
new file mode 100644
index 000000000..95cb8e6eb
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/unsafe_disabled.go
@@ -0,0 +1,23 @@
+//go:build noasm || nounsafe || gccgo || appengine
+
+/**
+ * Reed-Solomon Coding over 8-bit values.
+ *
+ * Copyright 2023, Klaus Post
+ */
+
+package reedsolomon
+
+// AllocAligned allocates 'shards' slices, with 'each' bytes.
+// Each slice will start on a 64 byte aligned boundary.
+func AllocAligned(shards, each int) [][]byte {
+ eachAligned := ((each + 63) / 64) * 64
+ total := make([]byte, eachAligned*shards+63)
+ // We cannot do initial align without "unsafe", just use native alignment.
+ res := make([][]byte, shards)
+ for i := range res {
+ res[i] = total[:each:eachAligned]
+ total = total[eachAligned:]
+ }
+ return res
+}
diff --git a/vendor/github.com/klauspost/reedsolomon/xor_arm64.go b/vendor/github.com/klauspost/reedsolomon/xor_arm64.go
new file mode 100644
index 000000000..6f0522f88
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/xor_arm64.go
@@ -0,0 +1,19 @@
+//go:build !noasm && !appengine && !gccgo
+
+package reedsolomon
+
+//go:noescape
+func xorSliceNEON(in, out []byte)
+
+// simple slice xor
+func sliceXor(in, out []byte, o *options) {
+ xorSliceNEON(in, out)
+ done := (len(in) >> 5) << 5
+
+ remain := len(in) - done
+ if remain > 0 {
+ for i := done; i < len(in); i++ {
+ out[i] ^= in[i]
+ }
+ }
+}
diff --git a/vendor/github.com/klauspost/reedsolomon/xor_arm64.s b/vendor/github.com/klauspost/reedsolomon/xor_arm64.s
new file mode 100644
index 000000000..562987316
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/xor_arm64.s
@@ -0,0 +1,29 @@
+//+build !noasm
+//+build !appengine
+//+build !gccgo
+
+// func xorSliceNEON(in, out []byte)
+TEXT ·xorSliceNEON(SB), 7, $0
+ MOVD in_base+0(FP), R1
+ MOVD in_len+8(FP), R2 // length of message
+ MOVD out_base+24(FP), R5
+ SUBS $32, R2
+ BMI completeXor
+
+loopXor:
+ // Main loop
+ VLD1.P 32(R1), [V0.B16, V1.B16]
+ VLD1 (R5), [V20.B16, V21.B16]
+
+ VEOR V20.B16, V0.B16, V4.B16
+ VEOR V21.B16, V1.B16, V5.B16
+
+ // Store result
+ VST1.P [V4.D2, V5.D2], 32(R5)
+
+ SUBS $32, R2
+ BPL loopXor
+
+completeXor:
+ RET
+
diff --git a/vendor/github.com/klauspost/reedsolomon/xor_noasm.go b/vendor/github.com/klauspost/reedsolomon/xor_noasm.go
new file mode 100644
index 000000000..d3e29f90e
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/xor_noasm.go
@@ -0,0 +1,7 @@
+//go:build noasm || gccgo || appengine || (!amd64 && !arm64)
+
+package reedsolomon
+
+func sliceXor(in, out []byte, o *options) {
+ sliceXorGo(in, out, o)
+}
diff --git a/vendor/golang.org/x/sys/cpu/asm_aix_ppc64.s b/vendor/golang.org/x/sys/cpu/asm_aix_ppc64.s
index db9171c2e..269e173ca 100644
--- a/vendor/golang.org/x/sys/cpu/asm_aix_ppc64.s
+++ b/vendor/golang.org/x/sys/cpu/asm_aix_ppc64.s
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build gc
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/cpu/cpu.go b/vendor/golang.org/x/sys/cpu/cpu.go
index 83f112c4c..4756ad5f7 100644
--- a/vendor/golang.org/x/sys/cpu/cpu.go
+++ b/vendor/golang.org/x/sys/cpu/cpu.go
@@ -38,7 +38,7 @@ var X86 struct {
HasAVX512F bool // Advanced vector extension 512 Foundation Instructions
HasAVX512CD bool // Advanced vector extension 512 Conflict Detection Instructions
HasAVX512ER bool // Advanced vector extension 512 Exponential and Reciprocal Instructions
- HasAVX512PF bool // Advanced vector extension 512 Prefetch Instructions Instructions
+ HasAVX512PF bool // Advanced vector extension 512 Prefetch Instructions
HasAVX512VL bool // Advanced vector extension 512 Vector Length Extensions
HasAVX512BW bool // Advanced vector extension 512 Byte and Word Instructions
HasAVX512DQ bool // Advanced vector extension 512 Doubleword and Quadword Instructions
@@ -54,6 +54,9 @@ var X86 struct {
HasAVX512VBMI2 bool // Advanced vector extension 512 Vector Byte Manipulation Instructions 2
HasAVX512BITALG bool // Advanced vector extension 512 Bit Algorithms
HasAVX512BF16 bool // Advanced vector extension 512 BFloat16 Instructions
+ HasAMXTile bool // Advanced Matrix Extension Tile instructions
+ HasAMXInt8 bool // Advanced Matrix Extension Int8 instructions
+ HasAMXBF16 bool // Advanced Matrix Extension BFloat16 instructions
HasBMI1 bool // Bit manipulation instruction set 1
HasBMI2 bool // Bit manipulation instruction set 2
HasCX16 bool // Compare and exchange 16 Bytes
diff --git a/vendor/golang.org/x/sys/cpu/cpu_aix.go b/vendor/golang.org/x/sys/cpu/cpu_aix.go
index 8aaeef545..9bf0c32eb 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_aix.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_aix.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix
-// +build aix
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_arm64.s b/vendor/golang.org/x/sys/cpu/cpu_arm64.s
index c61f95a05..fcb9a3888 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_arm64.s
+++ b/vendor/golang.org/x/sys/cpu/cpu_arm64.s
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build gc
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/cpu/cpu_gc_arm64.go b/vendor/golang.org/x/sys/cpu/cpu_gc_arm64.go
index ccf542a73..a8acd3e32 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_gc_arm64.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_gc_arm64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build gc
-// +build gc
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_gc_s390x.go b/vendor/golang.org/x/sys/cpu/cpu_gc_s390x.go
index 0af2f2484..c8ae6ddc1 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_gc_s390x.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_gc_s390x.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build gc
-// +build gc
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go b/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go
index fa7cdb9bc..910728fb1 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build (386 || amd64 || amd64p32) && gc
-// +build 386 amd64 amd64p32
-// +build gc
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_gccgo_arm64.go b/vendor/golang.org/x/sys/cpu/cpu_gccgo_arm64.go
index 2aff31891..7f1946780 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_gccgo_arm64.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_gccgo_arm64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build gccgo
-// +build gccgo
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_gccgo_s390x.go b/vendor/golang.org/x/sys/cpu/cpu_gccgo_s390x.go
index 4bfbda619..9526d2ce3 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_gccgo_s390x.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_gccgo_s390x.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build gccgo
-// +build gccgo
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.c b/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.c
index 6cc73109f..3f73a05dc 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.c
+++ b/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.c
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build (386 || amd64 || amd64p32) && gccgo
-// +build 386 amd64 amd64p32
-// +build gccgo
#include <cpuid.h>
#include <stdint.h>
diff --git a/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.go b/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.go
index 863d415ab..99c60fe9f 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.go
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build (386 || amd64 || amd64p32) && gccgo
-// +build 386 amd64 amd64p32
-// +build gccgo
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux.go b/vendor/golang.org/x/sys/cpu/cpu_linux.go
index 159a686f6..743eb5435 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_linux.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_linux.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build !386 && !amd64 && !amd64p32 && !arm64
-// +build !386,!amd64,!amd64p32,!arm64
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_mips64x.go b/vendor/golang.org/x/sys/cpu/cpu_linux_mips64x.go
index 6000db4cd..4686c1d54 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_linux_mips64x.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_linux_mips64x.go
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && (mips64 || mips64le)
-// +build linux
-// +build mips64 mips64le
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go b/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go
index f4992b1a5..cd63e7335 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && !arm && !arm64 && !mips64 && !mips64le && !ppc64 && !ppc64le && !s390x
-// +build linux,!arm,!arm64,!mips64,!mips64le,!ppc64,!ppc64le,!s390x
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_ppc64x.go b/vendor/golang.org/x/sys/cpu/cpu_linux_ppc64x.go
index 021356d6d..197188e67 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_linux_ppc64x.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_linux_ppc64x.go
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && (ppc64 || ppc64le)
-// +build linux
-// +build ppc64 ppc64le
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_loong64.go b/vendor/golang.org/x/sys/cpu/cpu_loong64.go
index 0f57b05bd..558635850 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_loong64.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_loong64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build loong64
-// +build loong64
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_mips64x.go b/vendor/golang.org/x/sys/cpu/cpu_mips64x.go
index f4063c664..fedb00cc4 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_mips64x.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_mips64x.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build mips64 || mips64le
-// +build mips64 mips64le
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_mipsx.go b/vendor/golang.org/x/sys/cpu/cpu_mipsx.go
index 07c4e36d8..ffb4ec7eb 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_mipsx.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_mipsx.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build mips || mipsle
-// +build mips mipsle
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_other_arm.go b/vendor/golang.org/x/sys/cpu/cpu_other_arm.go
index d7b4fb4cc..e9ecf2a45 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_other_arm.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_other_arm.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build !linux && arm
-// +build !linux,arm
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_other_arm64.go b/vendor/golang.org/x/sys/cpu/cpu_other_arm64.go
index f3cde129b..5341e7f88 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_other_arm64.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_other_arm64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build !linux && !netbsd && !openbsd && arm64
-// +build !linux,!netbsd,!openbsd,arm64
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_other_mips64x.go b/vendor/golang.org/x/sys/cpu/cpu_other_mips64x.go
index 0dafe9644..5f8f2419a 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_other_mips64x.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_other_mips64x.go
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build !linux && (mips64 || mips64le)
-// +build !linux
-// +build mips64 mips64le
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_other_ppc64x.go b/vendor/golang.org/x/sys/cpu/cpu_other_ppc64x.go
index 060d46b6e..89608fba2 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_other_ppc64x.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_other_ppc64x.go
@@ -3,9 +3,6 @@
// license that can be found in the LICENSE file.
//go:build !aix && !linux && (ppc64 || ppc64le)
-// +build !aix
-// +build !linux
-// +build ppc64 ppc64le
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_other_riscv64.go b/vendor/golang.org/x/sys/cpu/cpu_other_riscv64.go
index dd10eb79f..5ab87808f 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_other_riscv64.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_other_riscv64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build !linux && riscv64
-// +build !linux,riscv64
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_ppc64x.go b/vendor/golang.org/x/sys/cpu/cpu_ppc64x.go
index 4e8acd165..c14f12b14 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_ppc64x.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_ppc64x.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build ppc64 || ppc64le
-// +build ppc64 ppc64le
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_riscv64.go b/vendor/golang.org/x/sys/cpu/cpu_riscv64.go
index bd6c128af..7f0c79c00 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_riscv64.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_riscv64.go
@@ -3,10 +3,9 @@
// license that can be found in the LICENSE file.
//go:build riscv64
-// +build riscv64
package cpu
-const cacheLineSize = 32
+const cacheLineSize = 64
func initOptions() {}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_s390x.s b/vendor/golang.org/x/sys/cpu/cpu_s390x.s
index 96f81e209..1fb4b7013 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_s390x.s
+++ b/vendor/golang.org/x/sys/cpu/cpu_s390x.s
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build gc
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/cpu/cpu_wasm.go b/vendor/golang.org/x/sys/cpu/cpu_wasm.go
index 7747d888a..384787ea3 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_wasm.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_wasm.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build wasm
-// +build wasm
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_x86.go b/vendor/golang.org/x/sys/cpu/cpu_x86.go
index f5aacfc82..c29f5e4c5 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_x86.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_x86.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build 386 || amd64 || amd64p32
-// +build 386 amd64 amd64p32
package cpu
@@ -37,6 +36,9 @@ func initOptions() {
{Name: "avx512vbmi2", Feature: &X86.HasAVX512VBMI2},
{Name: "avx512bitalg", Feature: &X86.HasAVX512BITALG},
{Name: "avx512bf16", Feature: &X86.HasAVX512BF16},
+ {Name: "amxtile", Feature: &X86.HasAMXTile},
+ {Name: "amxint8", Feature: &X86.HasAMXInt8},
+ {Name: "amxbf16", Feature: &X86.HasAMXBF16},
{Name: "bmi1", Feature: &X86.HasBMI1},
{Name: "bmi2", Feature: &X86.HasBMI2},
{Name: "cx16", Feature: &X86.HasCX16},
@@ -138,6 +140,10 @@ func archInit() {
eax71, _, _, _ := cpuid(7, 1)
X86.HasAVX512BF16 = isSet(5, eax71)
}
+
+ X86.HasAMXTile = isSet(24, edx7)
+ X86.HasAMXInt8 = isSet(25, edx7)
+ X86.HasAMXBF16 = isSet(22, edx7)
}
func isSet(bitpos uint, value uint32) bool {
diff --git a/vendor/golang.org/x/sys/cpu/cpu_x86.s b/vendor/golang.org/x/sys/cpu/cpu_x86.s
index 39acab2ff..7d7ba33ef 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_x86.s
+++ b/vendor/golang.org/x/sys/cpu/cpu_x86.s
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build (386 || amd64 || amd64p32) && gc
-// +build 386 amd64 amd64p32
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/cpu/endian_big.go b/vendor/golang.org/x/sys/cpu/endian_big.go
index 93ce03a34..7fe04b0a1 100644
--- a/vendor/golang.org/x/sys/cpu/endian_big.go
+++ b/vendor/golang.org/x/sys/cpu/endian_big.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build armbe || arm64be || m68k || mips || mips64 || mips64p32 || ppc || ppc64 || s390 || s390x || shbe || sparc || sparc64
-// +build armbe arm64be m68k mips mips64 mips64p32 ppc ppc64 s390 s390x shbe sparc sparc64
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/endian_little.go b/vendor/golang.org/x/sys/cpu/endian_little.go
index 55db853ef..48eccc4c7 100644
--- a/vendor/golang.org/x/sys/cpu/endian_little.go
+++ b/vendor/golang.org/x/sys/cpu/endian_little.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build 386 || amd64 || amd64p32 || alpha || arm || arm64 || loong64 || mipsle || mips64le || mips64p32le || nios2 || ppc64le || riscv || riscv64 || sh || wasm
-// +build 386 amd64 amd64p32 alpha arm arm64 loong64 mipsle mips64le mips64p32le nios2 ppc64le riscv riscv64 sh wasm
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/hwcap_linux.go b/vendor/golang.org/x/sys/cpu/hwcap_linux.go
index 1d9d91f3e..34e49f955 100644
--- a/vendor/golang.org/x/sys/cpu/hwcap_linux.go
+++ b/vendor/golang.org/x/sys/cpu/hwcap_linux.go
@@ -5,7 +5,7 @@
package cpu
import (
- "io/ioutil"
+ "os"
)
const (
@@ -39,7 +39,7 @@ func readHWCAP() error {
return nil
}
- buf, err := ioutil.ReadFile(procAuxv)
+ buf, err := os.ReadFile(procAuxv)
if err != nil {
// e.g. on android /proc/self/auxv is not accessible, so silently
// ignore the error and leave Initialized = false. On some
diff --git a/vendor/golang.org/x/sys/cpu/proc_cpuinfo_linux.go b/vendor/golang.org/x/sys/cpu/proc_cpuinfo_linux.go
index d87bd6b3e..4cd64c704 100644
--- a/vendor/golang.org/x/sys/cpu/proc_cpuinfo_linux.go
+++ b/vendor/golang.org/x/sys/cpu/proc_cpuinfo_linux.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && arm64
-// +build linux,arm64
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/runtime_auxv_go121.go b/vendor/golang.org/x/sys/cpu/runtime_auxv_go121.go
index b975ea2a0..4c9788ea8 100644
--- a/vendor/golang.org/x/sys/cpu/runtime_auxv_go121.go
+++ b/vendor/golang.org/x/sys/cpu/runtime_auxv_go121.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build go1.21
-// +build go1.21
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/syscall_aix_gccgo.go b/vendor/golang.org/x/sys/cpu/syscall_aix_gccgo.go
index 96134157a..1b9ccb091 100644
--- a/vendor/golang.org/x/sys/cpu/syscall_aix_gccgo.go
+++ b/vendor/golang.org/x/sys/cpu/syscall_aix_gccgo.go
@@ -9,7 +9,6 @@
// gccgo's libgo and thus must not used a CGo method.
//go:build aix && gccgo
-// +build aix,gccgo
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/syscall_aix_ppc64_gc.go b/vendor/golang.org/x/sys/cpu/syscall_aix_ppc64_gc.go
index 904be42ff..e8b6cdbe9 100644
--- a/vendor/golang.org/x/sys/cpu/syscall_aix_ppc64_gc.go
+++ b/vendor/golang.org/x/sys/cpu/syscall_aix_ppc64_gc.go
@@ -7,7 +7,6 @@
// (See golang.org/issue/32102)
//go:build aix && ppc64 && gc
-// +build aix,ppc64,gc
package cpu
diff --git a/vendor/golang.org/x/sys/execabs/execabs_go118.go b/vendor/golang.org/x/sys/execabs/execabs_go118.go
index 2000064a8..5627d70e3 100644
--- a/vendor/golang.org/x/sys/execabs/execabs_go118.go
+++ b/vendor/golang.org/x/sys/execabs/execabs_go118.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build !go1.19
-// +build !go1.19
package execabs
diff --git a/vendor/golang.org/x/sys/execabs/execabs_go119.go b/vendor/golang.org/x/sys/execabs/execabs_go119.go
index f364b3418..d60ab1b41 100644
--- a/vendor/golang.org/x/sys/execabs/execabs_go119.go
+++ b/vendor/golang.org/x/sys/execabs/execabs_go119.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build go1.19
-// +build go1.19
package execabs
diff --git a/vendor/golang.org/x/sys/internal/unsafeheader/unsafeheader.go b/vendor/golang.org/x/sys/internal/unsafeheader/unsafeheader.go
deleted file mode 100644
index e07899b90..000000000
--- a/vendor/golang.org/x/sys/internal/unsafeheader/unsafeheader.go
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2020 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package unsafeheader contains header declarations for the Go runtime's
-// slice and string implementations.
-//
-// This package allows x/sys to use types equivalent to
-// reflect.SliceHeader and reflect.StringHeader without introducing
-// a dependency on the (relatively heavy) "reflect" package.
-package unsafeheader
-
-import (
- "unsafe"
-)
-
-// Slice is the runtime representation of a slice.
-// It cannot be used safely or portably and its representation may change in a later release.
-type Slice struct {
- Data unsafe.Pointer
- Len int
- Cap int
-}
-
-// String is the runtime representation of a string.
-// It cannot be used safely or portably and its representation may change in a later release.
-type String struct {
- Data unsafe.Pointer
- Len int
-}
diff --git a/vendor/golang.org/x/sys/plan9/pwd_go15_plan9.go b/vendor/golang.org/x/sys/plan9/pwd_go15_plan9.go
index c9b69937a..73687de74 100644
--- a/vendor/golang.org/x/sys/plan9/pwd_go15_plan9.go
+++ b/vendor/golang.org/x/sys/plan9/pwd_go15_plan9.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build go1.5
-// +build go1.5
package plan9
diff --git a/vendor/golang.org/x/sys/plan9/pwd_plan9.go b/vendor/golang.org/x/sys/plan9/pwd_plan9.go
index 98bf56b73..fb9458218 100644
--- a/vendor/golang.org/x/sys/plan9/pwd_plan9.go
+++ b/vendor/golang.org/x/sys/plan9/pwd_plan9.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build !go1.5
-// +build !go1.5
package plan9
diff --git a/vendor/golang.org/x/sys/plan9/race.go b/vendor/golang.org/x/sys/plan9/race.go
index 62377d2ff..c02d9ed33 100644
--- a/vendor/golang.org/x/sys/plan9/race.go
+++ b/vendor/golang.org/x/sys/plan9/race.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build plan9 && race
-// +build plan9,race
package plan9
diff --git a/vendor/golang.org/x/sys/plan9/race0.go b/vendor/golang.org/x/sys/plan9/race0.go
index f8da30876..7b15e15f6 100644
--- a/vendor/golang.org/x/sys/plan9/race0.go
+++ b/vendor/golang.org/x/sys/plan9/race0.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build plan9 && !race
-// +build plan9,!race
package plan9
diff --git a/vendor/golang.org/x/sys/plan9/str.go b/vendor/golang.org/x/sys/plan9/str.go
index 55fa8d025..ba3e8ff8a 100644
--- a/vendor/golang.org/x/sys/plan9/str.go
+++ b/vendor/golang.org/x/sys/plan9/str.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build plan9
-// +build plan9
package plan9
diff --git a/vendor/golang.org/x/sys/plan9/syscall.go b/vendor/golang.org/x/sys/plan9/syscall.go
index 67e5b0115..d631fd664 100644
--- a/vendor/golang.org/x/sys/plan9/syscall.go
+++ b/vendor/golang.org/x/sys/plan9/syscall.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build plan9
-// +build plan9
// Package plan9 contains an interface to the low-level operating system
// primitives. OS details vary depending on the underlying system, and
diff --git a/vendor/golang.org/x/sys/plan9/zsyscall_plan9_386.go b/vendor/golang.org/x/sys/plan9/zsyscall_plan9_386.go
index 3f40b9bd7..f780d5c80 100644
--- a/vendor/golang.org/x/sys/plan9/zsyscall_plan9_386.go
+++ b/vendor/golang.org/x/sys/plan9/zsyscall_plan9_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build plan9 && 386
-// +build plan9,386
package plan9
diff --git a/vendor/golang.org/x/sys/plan9/zsyscall_plan9_amd64.go b/vendor/golang.org/x/sys/plan9/zsyscall_plan9_amd64.go
index 0e6a96aa4..7de61065f 100644
--- a/vendor/golang.org/x/sys/plan9/zsyscall_plan9_amd64.go
+++ b/vendor/golang.org/x/sys/plan9/zsyscall_plan9_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build plan9 && amd64
-// +build plan9,amd64
package plan9
diff --git a/vendor/golang.org/x/sys/plan9/zsyscall_plan9_arm.go b/vendor/golang.org/x/sys/plan9/zsyscall_plan9_arm.go
index 244c501b7..ea85780f0 100644
--- a/vendor/golang.org/x/sys/plan9/zsyscall_plan9_arm.go
+++ b/vendor/golang.org/x/sys/plan9/zsyscall_plan9_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build plan9 && arm
-// +build plan9,arm
package plan9
diff --git a/vendor/golang.org/x/sys/unix/aliases.go b/vendor/golang.org/x/sys/unix/aliases.go
index abc89c104..b0e419857 100644
--- a/vendor/golang.org/x/sys/unix/aliases.go
+++ b/vendor/golang.org/x/sys/unix/aliases.go
@@ -2,9 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build (aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos) && go1.9
-// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos
-// +build go1.9
+//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos
package unix
diff --git a/vendor/golang.org/x/sys/unix/asm_aix_ppc64.s b/vendor/golang.org/x/sys/unix/asm_aix_ppc64.s
index db9171c2e..269e173ca 100644
--- a/vendor/golang.org/x/sys/unix/asm_aix_ppc64.s
+++ b/vendor/golang.org/x/sys/unix/asm_aix_ppc64.s
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build gc
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_bsd_386.s b/vendor/golang.org/x/sys/unix/asm_bsd_386.s
index e0fcd9b3d..a4fcef0e0 100644
--- a/vendor/golang.org/x/sys/unix/asm_bsd_386.s
+++ b/vendor/golang.org/x/sys/unix/asm_bsd_386.s
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build (freebsd || netbsd || openbsd) && gc
-// +build freebsd netbsd openbsd
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_bsd_amd64.s b/vendor/golang.org/x/sys/unix/asm_bsd_amd64.s
index 2b99c349a..1e63615c5 100644
--- a/vendor/golang.org/x/sys/unix/asm_bsd_amd64.s
+++ b/vendor/golang.org/x/sys/unix/asm_bsd_amd64.s
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build (darwin || dragonfly || freebsd || netbsd || openbsd) && gc
-// +build darwin dragonfly freebsd netbsd openbsd
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_bsd_arm.s b/vendor/golang.org/x/sys/unix/asm_bsd_arm.s
index d702d4adc..6496c3100 100644
--- a/vendor/golang.org/x/sys/unix/asm_bsd_arm.s
+++ b/vendor/golang.org/x/sys/unix/asm_bsd_arm.s
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build (freebsd || netbsd || openbsd) && gc
-// +build freebsd netbsd openbsd
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_bsd_arm64.s b/vendor/golang.org/x/sys/unix/asm_bsd_arm64.s
index fe36a7391..4fd1f54da 100644
--- a/vendor/golang.org/x/sys/unix/asm_bsd_arm64.s
+++ b/vendor/golang.org/x/sys/unix/asm_bsd_arm64.s
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build (darwin || freebsd || netbsd || openbsd) && gc
-// +build darwin freebsd netbsd openbsd
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_bsd_ppc64.s b/vendor/golang.org/x/sys/unix/asm_bsd_ppc64.s
index e5b9a8489..42f7eb9e4 100644
--- a/vendor/golang.org/x/sys/unix/asm_bsd_ppc64.s
+++ b/vendor/golang.org/x/sys/unix/asm_bsd_ppc64.s
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build (darwin || freebsd || netbsd || openbsd) && gc
-// +build darwin freebsd netbsd openbsd
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_bsd_riscv64.s b/vendor/golang.org/x/sys/unix/asm_bsd_riscv64.s
index d560019ea..f8902667e 100644
--- a/vendor/golang.org/x/sys/unix/asm_bsd_riscv64.s
+++ b/vendor/golang.org/x/sys/unix/asm_bsd_riscv64.s
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build (darwin || freebsd || netbsd || openbsd) && gc
-// +build darwin freebsd netbsd openbsd
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_linux_386.s b/vendor/golang.org/x/sys/unix/asm_linux_386.s
index 8fd101d07..3b4734870 100644
--- a/vendor/golang.org/x/sys/unix/asm_linux_386.s
+++ b/vendor/golang.org/x/sys/unix/asm_linux_386.s
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build gc
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_linux_amd64.s b/vendor/golang.org/x/sys/unix/asm_linux_amd64.s
index 7ed38e43c..67e29f317 100644
--- a/vendor/golang.org/x/sys/unix/asm_linux_amd64.s
+++ b/vendor/golang.org/x/sys/unix/asm_linux_amd64.s
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build gc
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_linux_arm.s b/vendor/golang.org/x/sys/unix/asm_linux_arm.s
index 8ef1d5140..d6ae269ce 100644
--- a/vendor/golang.org/x/sys/unix/asm_linux_arm.s
+++ b/vendor/golang.org/x/sys/unix/asm_linux_arm.s
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build gc
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_linux_arm64.s b/vendor/golang.org/x/sys/unix/asm_linux_arm64.s
index 98ae02760..01e5e253c 100644
--- a/vendor/golang.org/x/sys/unix/asm_linux_arm64.s
+++ b/vendor/golang.org/x/sys/unix/asm_linux_arm64.s
@@ -3,9 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && arm64 && gc
-// +build linux
-// +build arm64
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_linux_loong64.s b/vendor/golang.org/x/sys/unix/asm_linux_loong64.s
index 565357288..2abf12f6e 100644
--- a/vendor/golang.org/x/sys/unix/asm_linux_loong64.s
+++ b/vendor/golang.org/x/sys/unix/asm_linux_loong64.s
@@ -3,9 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && loong64 && gc
-// +build linux
-// +build loong64
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_linux_mips64x.s b/vendor/golang.org/x/sys/unix/asm_linux_mips64x.s
index 21231d2ce..f84bae712 100644
--- a/vendor/golang.org/x/sys/unix/asm_linux_mips64x.s
+++ b/vendor/golang.org/x/sys/unix/asm_linux_mips64x.s
@@ -3,9 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && (mips64 || mips64le) && gc
-// +build linux
-// +build mips64 mips64le
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_linux_mipsx.s b/vendor/golang.org/x/sys/unix/asm_linux_mipsx.s
index 6783b26c6..f08f62807 100644
--- a/vendor/golang.org/x/sys/unix/asm_linux_mipsx.s
+++ b/vendor/golang.org/x/sys/unix/asm_linux_mipsx.s
@@ -3,9 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && (mips || mipsle) && gc
-// +build linux
-// +build mips mipsle
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_linux_ppc64x.s b/vendor/golang.org/x/sys/unix/asm_linux_ppc64x.s
index 19d498934..bdfc024d2 100644
--- a/vendor/golang.org/x/sys/unix/asm_linux_ppc64x.s
+++ b/vendor/golang.org/x/sys/unix/asm_linux_ppc64x.s
@@ -3,9 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && (ppc64 || ppc64le) && gc
-// +build linux
-// +build ppc64 ppc64le
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_linux_riscv64.s b/vendor/golang.org/x/sys/unix/asm_linux_riscv64.s
index e42eb81d5..2e8c99612 100644
--- a/vendor/golang.org/x/sys/unix/asm_linux_riscv64.s
+++ b/vendor/golang.org/x/sys/unix/asm_linux_riscv64.s
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build riscv64 && gc
-// +build riscv64
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_linux_s390x.s b/vendor/golang.org/x/sys/unix/asm_linux_s390x.s
index c46aab339..2c394b11e 100644
--- a/vendor/golang.org/x/sys/unix/asm_linux_s390x.s
+++ b/vendor/golang.org/x/sys/unix/asm_linux_s390x.s
@@ -3,9 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && s390x && gc
-// +build linux
-// +build s390x
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_openbsd_mips64.s b/vendor/golang.org/x/sys/unix/asm_openbsd_mips64.s
index 5e7a1169c..fab586a2c 100644
--- a/vendor/golang.org/x/sys/unix/asm_openbsd_mips64.s
+++ b/vendor/golang.org/x/sys/unix/asm_openbsd_mips64.s
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build gc
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_solaris_amd64.s b/vendor/golang.org/x/sys/unix/asm_solaris_amd64.s
index f8c5394c1..f949ec547 100644
--- a/vendor/golang.org/x/sys/unix/asm_solaris_amd64.s
+++ b/vendor/golang.org/x/sys/unix/asm_solaris_amd64.s
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build gc
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/asm_zos_s390x.s b/vendor/golang.org/x/sys/unix/asm_zos_s390x.s
index 3b54e1858..2f67ba86d 100644
--- a/vendor/golang.org/x/sys/unix/asm_zos_s390x.s
+++ b/vendor/golang.org/x/sys/unix/asm_zos_s390x.s
@@ -3,9 +3,6 @@
// license that can be found in the LICENSE file.
//go:build zos && s390x && gc
-// +build zos
-// +build s390x
-// +build gc
#include "textflag.h"
diff --git a/vendor/golang.org/x/sys/unix/cap_freebsd.go b/vendor/golang.org/x/sys/unix/cap_freebsd.go
index 0b7c6adb8..a08657890 100644
--- a/vendor/golang.org/x/sys/unix/cap_freebsd.go
+++ b/vendor/golang.org/x/sys/unix/cap_freebsd.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build freebsd
-// +build freebsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/constants.go b/vendor/golang.org/x/sys/unix/constants.go
index 394a3965b..6fb7cb77d 100644
--- a/vendor/golang.org/x/sys/unix/constants.go
+++ b/vendor/golang.org/x/sys/unix/constants.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos
-// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos
package unix
diff --git a/vendor/golang.org/x/sys/unix/dev_aix_ppc.go b/vendor/golang.org/x/sys/unix/dev_aix_ppc.go
index 65a998508..d78513461 100644
--- a/vendor/golang.org/x/sys/unix/dev_aix_ppc.go
+++ b/vendor/golang.org/x/sys/unix/dev_aix_ppc.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix && ppc
-// +build aix,ppc
// Functions to access/create device major and minor numbers matching the
// encoding used by AIX.
diff --git a/vendor/golang.org/x/sys/unix/dev_aix_ppc64.go b/vendor/golang.org/x/sys/unix/dev_aix_ppc64.go
index 8fc08ad0a..623a5e697 100644
--- a/vendor/golang.org/x/sys/unix/dev_aix_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/dev_aix_ppc64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix && ppc64
-// +build aix,ppc64
// Functions to access/create device major and minor numbers matching the
// encoding used by AIX.
diff --git a/vendor/golang.org/x/sys/unix/dev_zos.go b/vendor/golang.org/x/sys/unix/dev_zos.go
index a388e59a0..bb6a64fe9 100644
--- a/vendor/golang.org/x/sys/unix/dev_zos.go
+++ b/vendor/golang.org/x/sys/unix/dev_zos.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build zos && s390x
-// +build zos,s390x
// Functions to access/create device major and minor numbers matching the
// encoding used by z/OS.
diff --git a/vendor/golang.org/x/sys/unix/dirent.go b/vendor/golang.org/x/sys/unix/dirent.go
index 2499f977b..1ebf11782 100644
--- a/vendor/golang.org/x/sys/unix/dirent.go
+++ b/vendor/golang.org/x/sys/unix/dirent.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos
-// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos
package unix
diff --git a/vendor/golang.org/x/sys/unix/endian_big.go b/vendor/golang.org/x/sys/unix/endian_big.go
index a52026557..1095fd31d 100644
--- a/vendor/golang.org/x/sys/unix/endian_big.go
+++ b/vendor/golang.org/x/sys/unix/endian_big.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//
//go:build armbe || arm64be || m68k || mips || mips64 || mips64p32 || ppc || ppc64 || s390 || s390x || shbe || sparc || sparc64
-// +build armbe arm64be m68k mips mips64 mips64p32 ppc ppc64 s390 s390x shbe sparc sparc64
package unix
diff --git a/vendor/golang.org/x/sys/unix/endian_little.go b/vendor/golang.org/x/sys/unix/endian_little.go
index b0f2bc4ae..b9f0e277b 100644
--- a/vendor/golang.org/x/sys/unix/endian_little.go
+++ b/vendor/golang.org/x/sys/unix/endian_little.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//
//go:build 386 || amd64 || amd64p32 || alpha || arm || arm64 || loong64 || mipsle || mips64le || mips64p32le || nios2 || ppc64le || riscv || riscv64 || sh
-// +build 386 amd64 amd64p32 alpha arm arm64 loong64 mipsle mips64le mips64p32le nios2 ppc64le riscv riscv64 sh
package unix
diff --git a/vendor/golang.org/x/sys/unix/env_unix.go b/vendor/golang.org/x/sys/unix/env_unix.go
index 29ccc4d13..a96da71f4 100644
--- a/vendor/golang.org/x/sys/unix/env_unix.go
+++ b/vendor/golang.org/x/sys/unix/env_unix.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos
-// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos
// Unix environment variables.
diff --git a/vendor/golang.org/x/sys/unix/epoll_zos.go b/vendor/golang.org/x/sys/unix/epoll_zos.go
index cedaf7e02..7753fddea 100644
--- a/vendor/golang.org/x/sys/unix/epoll_zos.go
+++ b/vendor/golang.org/x/sys/unix/epoll_zos.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build zos && s390x
-// +build zos,s390x
package unix
diff --git a/vendor/golang.org/x/sys/unix/fcntl.go b/vendor/golang.org/x/sys/unix/fcntl.go
index e9b991258..6200876fb 100644
--- a/vendor/golang.org/x/sys/unix/fcntl.go
+++ b/vendor/golang.org/x/sys/unix/fcntl.go
@@ -2,8 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build dragonfly || freebsd || linux || netbsd || openbsd
-// +build dragonfly freebsd linux netbsd openbsd
+//go:build dragonfly || freebsd || linux || netbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/fcntl_linux_32bit.go b/vendor/golang.org/x/sys/unix/fcntl_linux_32bit.go
index 29d44808b..13b4acd5c 100644
--- a/vendor/golang.org/x/sys/unix/fcntl_linux_32bit.go
+++ b/vendor/golang.org/x/sys/unix/fcntl_linux_32bit.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build (linux && 386) || (linux && arm) || (linux && mips) || (linux && mipsle) || (linux && ppc)
-// +build linux,386 linux,arm linux,mips linux,mipsle linux,ppc
package unix
diff --git a/vendor/golang.org/x/sys/unix/fdset.go b/vendor/golang.org/x/sys/unix/fdset.go
index a8068f94f..9e83d18cd 100644
--- a/vendor/golang.org/x/sys/unix/fdset.go
+++ b/vendor/golang.org/x/sys/unix/fdset.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos
-// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos
package unix
diff --git a/vendor/golang.org/x/sys/unix/fstatfs_zos.go b/vendor/golang.org/x/sys/unix/fstatfs_zos.go
index e377cc9f4..c8bde601e 100644
--- a/vendor/golang.org/x/sys/unix/fstatfs_zos.go
+++ b/vendor/golang.org/x/sys/unix/fstatfs_zos.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build zos && s390x
-// +build zos,s390x
package unix
diff --git a/vendor/golang.org/x/sys/unix/gccgo.go b/vendor/golang.org/x/sys/unix/gccgo.go
index b06f52d74..aca5721dd 100644
--- a/vendor/golang.org/x/sys/unix/gccgo.go
+++ b/vendor/golang.org/x/sys/unix/gccgo.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build gccgo && !aix && !hurd
-// +build gccgo,!aix,!hurd
package unix
diff --git a/vendor/golang.org/x/sys/unix/gccgo_c.c b/vendor/golang.org/x/sys/unix/gccgo_c.c
index f98a1c542..d468b7b47 100644
--- a/vendor/golang.org/x/sys/unix/gccgo_c.c
+++ b/vendor/golang.org/x/sys/unix/gccgo_c.c
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build gccgo && !aix && !hurd
-// +build gccgo,!aix,!hurd
#include
#include
diff --git a/vendor/golang.org/x/sys/unix/gccgo_linux_amd64.go b/vendor/golang.org/x/sys/unix/gccgo_linux_amd64.go
index e60e49a3d..972d61bd7 100644
--- a/vendor/golang.org/x/sys/unix/gccgo_linux_amd64.go
+++ b/vendor/golang.org/x/sys/unix/gccgo_linux_amd64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build gccgo && linux && amd64
-// +build gccgo,linux,amd64
package unix
diff --git a/vendor/golang.org/x/sys/unix/ifreq_linux.go b/vendor/golang.org/x/sys/unix/ifreq_linux.go
index 15721a510..848840ae4 100644
--- a/vendor/golang.org/x/sys/unix/ifreq_linux.go
+++ b/vendor/golang.org/x/sys/unix/ifreq_linux.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux
-// +build linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/ioctl_linux.go b/vendor/golang.org/x/sys/unix/ioctl_linux.go
index 0d12c0851..dbe680eab 100644
--- a/vendor/golang.org/x/sys/unix/ioctl_linux.go
+++ b/vendor/golang.org/x/sys/unix/ioctl_linux.go
@@ -231,3 +231,8 @@ func IoctlLoopGetStatus64(fd int) (*LoopInfo64, error) {
func IoctlLoopSetStatus64(fd int, value *LoopInfo64) error {
return ioctlPtr(fd, LOOP_SET_STATUS64, unsafe.Pointer(value))
}
+
+// IoctlLoopConfigure configures all loop device parameters in a single step
+func IoctlLoopConfigure(fd int, value *LoopConfig) error {
+ return ioctlPtr(fd, LOOP_CONFIGURE, unsafe.Pointer(value))
+}
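A minimal sketch of how the new IoctlLoopConfigure helper could be used (illustrative only, not part of the vendored change): attach a backing file to a loop device with a single LOOP_CONFIGURE call instead of the older LOOP_SET_FD + LOOP_SET_STATUS64 sequence. The LoopConfig field name Fd is assumed to mirror the kernel's struct loop_config, and /dev/loop0 and /tmp/disk.img are hypothetical paths.

package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

func main() {
	backing, err := os.Open("/tmp/disk.img") // hypothetical backing file
	if err != nil {
		panic(err)
	}
	defer backing.Close()

	loop, err := os.OpenFile("/dev/loop0", os.O_RDWR, 0) // hypothetical free loop device
	if err != nil {
		panic(err)
	}
	defer loop.Close()

	// Zero values for the remaining fields keep the kernel defaults.
	cfg := unix.LoopConfig{Fd: uint32(backing.Fd())}
	if err := unix.IoctlLoopConfigure(int(loop.Fd()), &cfg); err != nil {
		panic(err)
	}
	fmt.Println("loop device configured")
}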
diff --git a/vendor/golang.org/x/sys/unix/ioctl_signed.go b/vendor/golang.org/x/sys/unix/ioctl_signed.go
index 7def9580e..5b0759bd8 100644
--- a/vendor/golang.org/x/sys/unix/ioctl_signed.go
+++ b/vendor/golang.org/x/sys/unix/ioctl_signed.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix || solaris
-// +build aix solaris
package unix
diff --git a/vendor/golang.org/x/sys/unix/ioctl_unsigned.go b/vendor/golang.org/x/sys/unix/ioctl_unsigned.go
index 649913d1e..20f470b9d 100644
--- a/vendor/golang.org/x/sys/unix/ioctl_unsigned.go
+++ b/vendor/golang.org/x/sys/unix/ioctl_unsigned.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build darwin || dragonfly || freebsd || hurd || linux || netbsd || openbsd
-// +build darwin dragonfly freebsd hurd linux netbsd openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/ioctl_zos.go b/vendor/golang.org/x/sys/unix/ioctl_zos.go
index cdc21bf76..c8b2a750f 100644
--- a/vendor/golang.org/x/sys/unix/ioctl_zos.go
+++ b/vendor/golang.org/x/sys/unix/ioctl_zos.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build zos && s390x
-// +build zos,s390x
package unix
diff --git a/vendor/golang.org/x/sys/unix/mkerrors.sh b/vendor/golang.org/x/sys/unix/mkerrors.sh
index 8f775fafa..fdcaa974d 100644
--- a/vendor/golang.org/x/sys/unix/mkerrors.sh
+++ b/vendor/golang.org/x/sys/unix/mkerrors.sh
@@ -248,6 +248,7 @@ struct ltchars {
#include
#include
#include
+#include
#include
#include
#include
@@ -283,10 +284,6 @@ struct ltchars {
#include
#endif
-#ifndef MSG_FASTOPEN
-#define MSG_FASTOPEN 0x20000000
-#endif
-
#ifndef PTRACE_GETREGS
#define PTRACE_GETREGS 0xc
#endif
@@ -295,14 +292,6 @@ struct ltchars {
#define PTRACE_SETREGS 0xd
#endif
-#ifndef SOL_NETLINK
-#define SOL_NETLINK 270
-#endif
-
-#ifndef SOL_SMC
-#define SOL_SMC 286
-#endif
-
#ifdef SOL_BLUETOOTH
// SPARC includes this in /usr/include/sparc64-linux-gnu/bits/socket.h
// but it is already in bluetooth_linux.go
@@ -319,10 +308,23 @@ struct ltchars {
#undef TIPC_WAIT_FOREVER
#define TIPC_WAIT_FOREVER 0xffffffff
-// Copied from linux/l2tp.h
-// Including linux/l2tp.h here causes conflicts between linux/in.h
-// and netinet/in.h included via net/route.h above.
-#define IPPROTO_L2TP 115
+// Copied from linux/netfilter/nf_nat.h
+// Including linux/netfilter/nf_nat.h here causes conflicts between linux/in.h
+// and netinet/in.h.
+#define NF_NAT_RANGE_MAP_IPS (1 << 0)
+#define NF_NAT_RANGE_PROTO_SPECIFIED (1 << 1)
+#define NF_NAT_RANGE_PROTO_RANDOM (1 << 2)
+#define NF_NAT_RANGE_PERSISTENT (1 << 3)
+#define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4)
+#define NF_NAT_RANGE_PROTO_OFFSET (1 << 5)
+#define NF_NAT_RANGE_NETMAP (1 << 6)
+#define NF_NAT_RANGE_PROTO_RANDOM_ALL \
+ (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY)
+#define NF_NAT_RANGE_MASK \
+ (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \
+ NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \
+ NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET | \
+ NF_NAT_RANGE_NETMAP)
// Copied from linux/hid.h.
// Keep in sync with the size of the referenced fields.
@@ -519,6 +521,7 @@ ccflags="$@"
$2 ~ /^LOCK_(SH|EX|NB|UN)$/ ||
$2 ~ /^LO_(KEY|NAME)_SIZE$/ ||
$2 ~ /^LOOP_(CLR|CTL|GET|SET)_/ ||
+ $2 == "LOOP_CONFIGURE" ||
$2 ~ /^(AF|SOCK|SO|SOL|IPPROTO|IP|IPV6|TCP|MCAST|EVFILT|NOTE|SHUT|PROT|MAP|MREMAP|MFD|T?PACKET|MSG|SCM|MCL|DT|MADV|PR|LOCAL|TCPOPT|UDP)_/ ||
$2 ~ /^NFC_(GENL|PROTO|COMM|RF|SE|DIRECTION|LLCP|SOCKPROTO)_/ ||
$2 ~ /^NFC_.*_(MAX)?SIZE$/ ||
@@ -560,7 +563,7 @@ ccflags="$@"
$2 ~ /^RLIMIT_(AS|CORE|CPU|DATA|FSIZE|LOCKS|MEMLOCK|MSGQUEUE|NICE|NOFILE|NPROC|RSS|RTPRIO|RTTIME|SIGPENDING|STACK)|RLIM_INFINITY/ ||
$2 ~ /^PRIO_(PROCESS|PGRP|USER)/ ||
$2 ~ /^CLONE_[A-Z_]+/ ||
- $2 !~ /^(BPF_TIMEVAL|BPF_FIB_LOOKUP_[A-Z]+)$/ &&
+ $2 !~ /^(BPF_TIMEVAL|BPF_FIB_LOOKUP_[A-Z]+|BPF_F_LINK)$/ &&
$2 ~ /^(BPF|DLT)_/ ||
$2 ~ /^AUDIT_/ ||
$2 ~ /^(CLOCK|TIMER)_/ ||
@@ -581,8 +584,9 @@ ccflags="$@"
$2 ~ /^KEY_(SPEC|REQKEY_DEFL)_/ ||
$2 ~ /^KEYCTL_/ ||
$2 ~ /^PERF_/ ||
- $2 ~ /^SECCOMP_MODE_/ ||
+ $2 ~ /^SECCOMP_/ ||
$2 ~ /^SEEK_/ ||
+ $2 ~ /^SCHED_/ ||
$2 ~ /^SPLICE_/ ||
$2 ~ /^SYNC_FILE_RANGE_/ ||
$2 !~ /IOC_MAGIC/ &&
@@ -601,6 +605,9 @@ ccflags="$@"
$2 ~ /^FSOPT_/ ||
$2 ~ /^WDIO[CFS]_/ ||
$2 ~ /^NFN/ ||
+ $2 !~ /^NFT_META_IIFTYPE/ &&
+ $2 ~ /^NFT_/ ||
+ $2 ~ /^NF_NAT_/ ||
$2 ~ /^XDP_/ ||
$2 ~ /^RWF_/ ||
$2 ~ /^(HDIO|WIN|SMART)_/ ||
@@ -662,7 +669,6 @@ echo '// mkerrors.sh' "$@"
echo '// Code generated by the command above; see README.md. DO NOT EDIT.'
echo
echo "//go:build ${GOARCH} && ${GOOS}"
-echo "// +build ${GOARCH},${GOOS}"
echo
go tool cgo -godefs -- "$@" _const.go >_error.out
cat _error.out | grep -vf _error.grep | grep -vf _signal.grep
diff --git a/vendor/golang.org/x/sys/unix/mmap_nomremap.go b/vendor/golang.org/x/sys/unix/mmap_nomremap.go
index ca0513632..4b68e5978 100644
--- a/vendor/golang.org/x/sys/unix/mmap_nomremap.go
+++ b/vendor/golang.org/x/sys/unix/mmap_nomremap.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix || darwin || dragonfly || freebsd || openbsd || solaris
-// +build aix darwin dragonfly freebsd openbsd solaris
package unix
diff --git a/vendor/golang.org/x/sys/unix/mremap.go b/vendor/golang.org/x/sys/unix/mremap.go
index fa93d0aa9..fd45fe529 100644
--- a/vendor/golang.org/x/sys/unix/mremap.go
+++ b/vendor/golang.org/x/sys/unix/mremap.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux || netbsd
-// +build linux netbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/pagesize_unix.go b/vendor/golang.org/x/sys/unix/pagesize_unix.go
index 53f1b4c5b..4d0a3430e 100644
--- a/vendor/golang.org/x/sys/unix/pagesize_unix.go
+++ b/vendor/golang.org/x/sys/unix/pagesize_unix.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
-// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
// For Unix, get the pagesize from the runtime.
diff --git a/vendor/golang.org/x/sys/unix/pledge_openbsd.go b/vendor/golang.org/x/sys/unix/pledge_openbsd.go
index eb48294b2..6a09af53e 100644
--- a/vendor/golang.org/x/sys/unix/pledge_openbsd.go
+++ b/vendor/golang.org/x/sys/unix/pledge_openbsd.go
@@ -8,54 +8,31 @@ import (
"errors"
"fmt"
"strconv"
- "syscall"
- "unsafe"
)
// Pledge implements the pledge syscall.
//
-// The pledge syscall does not accept execpromises on OpenBSD releases
-// before 6.3.
-//
-// execpromises must be empty when Pledge is called on OpenBSD
-// releases predating 6.3, otherwise an error will be returned.
+// This changes both the promises and execpromises; use PledgePromises or
+// PledgeExecpromises to only change the promises or execpromises
+// respectively.
//
// For more information see pledge(2).
func Pledge(promises, execpromises string) error {
- maj, min, err := majmin()
+ if err := pledgeAvailable(); err != nil {
+ return err
+ }
+
+ pptr, err := BytePtrFromString(promises)
if err != nil {
return err
}
- err = pledgeAvailable(maj, min, execpromises)
+ exptr, err := BytePtrFromString(execpromises)
if err != nil {
return err
}
- pptr, err := syscall.BytePtrFromString(promises)
- if err != nil {
- return err
- }
-
- // This variable will hold either a nil unsafe.Pointer or
- // an unsafe.Pointer to a string (execpromises).
- var expr unsafe.Pointer
-
- // If we're running on OpenBSD > 6.2, pass execpromises to the syscall.
- if maj > 6 || (maj == 6 && min > 2) {
- exptr, err := syscall.BytePtrFromString(execpromises)
- if err != nil {
- return err
- }
- expr = unsafe.Pointer(exptr)
- }
-
- _, _, e := syscall.Syscall(SYS_PLEDGE, uintptr(unsafe.Pointer(pptr)), uintptr(expr), 0)
- if e != 0 {
- return e
- }
-
- return nil
+ return pledge(pptr, exptr)
}
// PledgePromises implements the pledge syscall.
@@ -64,30 +41,16 @@ func Pledge(promises, execpromises string) error {
//
// For more information see pledge(2).
func PledgePromises(promises string) error {
- maj, min, err := majmin()
+ if err := pledgeAvailable(); err != nil {
+ return err
+ }
+
+ pptr, err := BytePtrFromString(promises)
if err != nil {
return err
}
- err = pledgeAvailable(maj, min, "")
- if err != nil {
- return err
- }
-
- // This variable holds the execpromises and is always nil.
- var expr unsafe.Pointer
-
- pptr, err := syscall.BytePtrFromString(promises)
- if err != nil {
- return err
- }
-
- _, _, e := syscall.Syscall(SYS_PLEDGE, uintptr(unsafe.Pointer(pptr)), uintptr(expr), 0)
- if e != 0 {
- return e
- }
-
- return nil
+ return pledge(pptr, nil)
}
// PledgeExecpromises implements the pledge syscall.
@@ -96,30 +59,16 @@ func PledgePromises(promises string) error {
//
// For more information see pledge(2).
func PledgeExecpromises(execpromises string) error {
- maj, min, err := majmin()
+ if err := pledgeAvailable(); err != nil {
+ return err
+ }
+
+ exptr, err := BytePtrFromString(execpromises)
if err != nil {
return err
}
- err = pledgeAvailable(maj, min, execpromises)
- if err != nil {
- return err
- }
-
- // This variable holds the promises and is always nil.
- var pptr unsafe.Pointer
-
- exptr, err := syscall.BytePtrFromString(execpromises)
- if err != nil {
- return err
- }
-
- _, _, e := syscall.Syscall(SYS_PLEDGE, uintptr(pptr), uintptr(unsafe.Pointer(exptr)), 0)
- if e != 0 {
- return e
- }
-
- return nil
+ return pledge(nil, exptr)
}
// majmin returns major and minor version number for an OpenBSD system.
@@ -147,16 +96,15 @@ func majmin() (major int, minor int, err error) {
// pledgeAvailable checks for availability of the pledge(2) syscall
// based on the running OpenBSD version.
-func pledgeAvailable(maj, min int, execpromises string) error {
- // If OpenBSD <= 5.9, pledge is not available.
- if (maj == 5 && min != 9) || maj < 5 {
- return fmt.Errorf("pledge syscall is not available on OpenBSD %d.%d", maj, min)
+func pledgeAvailable() error {
+ maj, min, err := majmin()
+ if err != nil {
+ return err
}
- // If OpenBSD <= 6.2 and execpromises is not empty,
- // return an error - execpromises is not available before 6.3
- if (maj < 6 || (maj == 6 && min <= 2)) && execpromises != "" {
- return fmt.Errorf("cannot use execpromises on OpenBSD %d.%d", maj, min)
+ // Require OpenBSD 6.4 as a minimum.
+ if maj < 6 || (maj == 6 && min <= 3) {
+ return fmt.Errorf("cannot call Pledge on OpenBSD %d.%d", maj, min)
}
return nil
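A minimal sketch of the wrappers this refactor feeds into, assuming an OpenBSD 6.4+ host: PledgePromises changes only the promises, while Pledge sets promises and execpromises together.

package main

import "golang.org/x/sys/unix"

func main() {
	// Restrict the process to stdio and read-only filesystem access.
	if err := unix.PledgePromises("stdio rpath"); err != nil {
		panic(err)
	}
	// From here on, syscalls outside the pledged promises terminate the process.
}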
diff --git a/vendor/golang.org/x/sys/unix/ptrace_darwin.go b/vendor/golang.org/x/sys/unix/ptrace_darwin.go
index 39dba6ca6..3f0975f3d 100644
--- a/vendor/golang.org/x/sys/unix/ptrace_darwin.go
+++ b/vendor/golang.org/x/sys/unix/ptrace_darwin.go
@@ -3,16 +3,9 @@
// license that can be found in the LICENSE file.
//go:build darwin && !ios
-// +build darwin,!ios
package unix
-import "unsafe"
-
func ptrace(request int, pid int, addr uintptr, data uintptr) error {
return ptrace1(request, pid, addr, data)
}
-
-func ptracePtr(request int, pid int, addr uintptr, data unsafe.Pointer) error {
- return ptrace1Ptr(request, pid, addr, data)
-}
diff --git a/vendor/golang.org/x/sys/unix/ptrace_ios.go b/vendor/golang.org/x/sys/unix/ptrace_ios.go
index 9ea66330a..a4d35db5d 100644
--- a/vendor/golang.org/x/sys/unix/ptrace_ios.go
+++ b/vendor/golang.org/x/sys/unix/ptrace_ios.go
@@ -3,16 +3,9 @@
// license that can be found in the LICENSE file.
//go:build ios
-// +build ios
package unix
-import "unsafe"
-
func ptrace(request int, pid int, addr uintptr, data uintptr) (err error) {
return ENOTSUP
}
-
-func ptracePtr(request int, pid int, addr uintptr, data unsafe.Pointer) (err error) {
- return ENOTSUP
-}
diff --git a/vendor/golang.org/x/sys/unix/race.go b/vendor/golang.org/x/sys/unix/race.go
index 6f6c5fec5..714d2aae7 100644
--- a/vendor/golang.org/x/sys/unix/race.go
+++ b/vendor/golang.org/x/sys/unix/race.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build (darwin && race) || (linux && race) || (freebsd && race)
-// +build darwin,race linux,race freebsd,race
package unix
diff --git a/vendor/golang.org/x/sys/unix/race0.go b/vendor/golang.org/x/sys/unix/race0.go
index 706e1322a..4a9f6634c 100644
--- a/vendor/golang.org/x/sys/unix/race0.go
+++ b/vendor/golang.org/x/sys/unix/race0.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix || (darwin && !race) || (linux && !race) || (freebsd && !race) || netbsd || openbsd || solaris || dragonfly || zos
-// +build aix darwin,!race linux,!race freebsd,!race netbsd openbsd solaris dragonfly zos
package unix
diff --git a/vendor/golang.org/x/sys/unix/readdirent_getdents.go b/vendor/golang.org/x/sys/unix/readdirent_getdents.go
index 4d6257569..dbd2b6ccb 100644
--- a/vendor/golang.org/x/sys/unix/readdirent_getdents.go
+++ b/vendor/golang.org/x/sys/unix/readdirent_getdents.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix || dragonfly || freebsd || linux || netbsd || openbsd
-// +build aix dragonfly freebsd linux netbsd openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/readdirent_getdirentries.go b/vendor/golang.org/x/sys/unix/readdirent_getdirentries.go
index 2a4ba47c4..130398b6b 100644
--- a/vendor/golang.org/x/sys/unix/readdirent_getdirentries.go
+++ b/vendor/golang.org/x/sys/unix/readdirent_getdirentries.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build darwin
-// +build darwin
package unix
diff --git a/vendor/golang.org/x/sys/unix/sockcmsg_unix.go b/vendor/golang.org/x/sys/unix/sockcmsg_unix.go
index 3865943f6..c3a62dbb1 100644
--- a/vendor/golang.org/x/sys/unix/sockcmsg_unix.go
+++ b/vendor/golang.org/x/sys/unix/sockcmsg_unix.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos
-// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos
// Socket control messages
diff --git a/vendor/golang.org/x/sys/unix/sockcmsg_unix_other.go b/vendor/golang.org/x/sys/unix/sockcmsg_unix_other.go
index 0840fe4a5..4a1eab37e 100644
--- a/vendor/golang.org/x/sys/unix/sockcmsg_unix_other.go
+++ b/vendor/golang.org/x/sys/unix/sockcmsg_unix_other.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix || darwin || freebsd || linux || netbsd || openbsd || solaris || zos
-// +build aix darwin freebsd linux netbsd openbsd solaris zos
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall.go b/vendor/golang.org/x/sys/unix/syscall.go
index 63e8c8383..5ea74da98 100644
--- a/vendor/golang.org/x/sys/unix/syscall.go
+++ b/vendor/golang.org/x/sys/unix/syscall.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos
-// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos
// Package unix contains an interface to the low-level operating system
// primitives. OS details vary depending on the underlying system, and
diff --git a/vendor/golang.org/x/sys/unix/syscall_aix.go b/vendor/golang.org/x/sys/unix/syscall_aix.go
index 9a6e5acac..67ce6cef2 100644
--- a/vendor/golang.org/x/sys/unix/syscall_aix.go
+++ b/vendor/golang.org/x/sys/unix/syscall_aix.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix
-// +build aix
// Aix system calls.
// This file is compiled as ordinary Go code,
@@ -107,7 +106,8 @@ func (sa *SockaddrUnix) sockaddr() (unsafe.Pointer, _Socklen, error) {
if n > 0 {
sl += _Socklen(n) + 1
}
- if sa.raw.Path[0] == '@' {
+ if sa.raw.Path[0] == '@' || (sa.raw.Path[0] == 0 && sl > 3) {
+ // Check sl > 3 so we don't change unnamed socket behavior.
sa.raw.Path[0] = 0
// Don't count trailing NUL for abstract address.
sl--
@@ -487,8 +487,6 @@ func Fsync(fd int) error {
//sys Unlinkat(dirfd int, path string, flags int) (err error)
//sys Ustat(dev int, ubuf *Ustat_t) (err error)
//sys write(fd int, p []byte) (n int, err error)
-//sys readlen(fd int, p *byte, np int) (n int, err error) = read
-//sys writelen(fd int, p *byte, np int) (n int, err error) = write
//sys Dup2(oldfd int, newfd int) (err error)
//sys Fadvise(fd int, offset int64, length int64, advice int) (err error) = posix_fadvise64
diff --git a/vendor/golang.org/x/sys/unix/syscall_aix_ppc.go b/vendor/golang.org/x/sys/unix/syscall_aix_ppc.go
index f2871fa95..1fdaa4760 100644
--- a/vendor/golang.org/x/sys/unix/syscall_aix_ppc.go
+++ b/vendor/golang.org/x/sys/unix/syscall_aix_ppc.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix && ppc
-// +build aix,ppc
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_aix_ppc64.go b/vendor/golang.org/x/sys/unix/syscall_aix_ppc64.go
index 75718ec0f..c87f9a9f4 100644
--- a/vendor/golang.org/x/sys/unix/syscall_aix_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_aix_ppc64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix && ppc64
-// +build aix,ppc64
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_bsd.go b/vendor/golang.org/x/sys/unix/syscall_bsd.go
index 4217de518..a00c3e545 100644
--- a/vendor/golang.org/x/sys/unix/syscall_bsd.go
+++ b/vendor/golang.org/x/sys/unix/syscall_bsd.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build darwin || dragonfly || freebsd || netbsd || openbsd
-// +build darwin dragonfly freebsd netbsd openbsd
// BSD system call wrappers shared by *BSD based systems
// including OS X (Darwin) and FreeBSD. Like the other
@@ -317,7 +316,7 @@ func GetsockoptString(fd, level, opt int) (string, error) {
if err != nil {
return "", err
}
- return string(buf[:vallen-1]), nil
+ return ByteSliceToString(buf[:vallen]), nil
}
//sys recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Socklen) (n int, err error)
diff --git a/vendor/golang.org/x/sys/unix/syscall_darwin.go b/vendor/golang.org/x/sys/unix/syscall_darwin.go
index 135cc3cd7..59542a897 100644
--- a/vendor/golang.org/x/sys/unix/syscall_darwin.go
+++ b/vendor/golang.org/x/sys/unix/syscall_darwin.go
@@ -644,189 +644,3 @@ func SysctlKinfoProcSlice(name string, args ...int) ([]KinfoProc, error) {
//sys write(fd int, p []byte) (n int, err error)
//sys mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) (ret uintptr, err error)
//sys munmap(addr uintptr, length uintptr) (err error)
-//sys readlen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_READ
-//sys writelen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_WRITE
-
-/*
- * Unimplemented
- */
-// Profil
-// Sigaction
-// Sigprocmask
-// Getlogin
-// Sigpending
-// Sigaltstack
-// Ioctl
-// Reboot
-// Execve
-// Vfork
-// Sbrk
-// Sstk
-// Ovadvise
-// Mincore
-// Setitimer
-// Swapon
-// Select
-// Sigsuspend
-// Readv
-// Writev
-// Nfssvc
-// Getfh
-// Quotactl
-// Csops
-// Waitid
-// Add_profil
-// Kdebug_trace
-// Sigreturn
-// Atsocket
-// Kqueue_from_portset_np
-// Kqueue_portset
-// Getattrlist
-// Getdirentriesattr
-// Searchfs
-// Delete
-// Copyfile
-// Watchevent
-// Waitevent
-// Modwatch
-// Fsctl
-// Initgroups
-// Posix_spawn
-// Nfsclnt
-// Fhopen
-// Minherit
-// Semsys
-// Msgsys
-// Shmsys
-// Semctl
-// Semget
-// Semop
-// Msgctl
-// Msgget
-// Msgsnd
-// Msgrcv
-// Shm_open
-// Shm_unlink
-// Sem_open
-// Sem_close
-// Sem_unlink
-// Sem_wait
-// Sem_trywait
-// Sem_post
-// Sem_getvalue
-// Sem_init
-// Sem_destroy
-// Open_extended
-// Umask_extended
-// Stat_extended
-// Lstat_extended
-// Fstat_extended
-// Chmod_extended
-// Fchmod_extended
-// Access_extended
-// Settid
-// Gettid
-// Setsgroups
-// Getsgroups
-// Setwgroups
-// Getwgroups
-// Mkfifo_extended
-// Mkdir_extended
-// Identitysvc
-// Shared_region_check_np
-// Shared_region_map_np
-// __pthread_mutex_destroy
-// __pthread_mutex_init
-// __pthread_mutex_lock
-// __pthread_mutex_trylock
-// __pthread_mutex_unlock
-// __pthread_cond_init
-// __pthread_cond_destroy
-// __pthread_cond_broadcast
-// __pthread_cond_signal
-// Setsid_with_pid
-// __pthread_cond_timedwait
-// Aio_fsync
-// Aio_return
-// Aio_suspend
-// Aio_cancel
-// Aio_error
-// Aio_read
-// Aio_write
-// Lio_listio
-// __pthread_cond_wait
-// Iopolicysys
-// __pthread_kill
-// __pthread_sigmask
-// __sigwait
-// __disable_threadsignal
-// __pthread_markcancel
-// __pthread_canceled
-// __semwait_signal
-// Proc_info
-// sendfile
-// Stat64_extended
-// Lstat64_extended
-// Fstat64_extended
-// __pthread_chdir
-// __pthread_fchdir
-// Audit
-// Auditon
-// Getauid
-// Setauid
-// Getaudit
-// Setaudit
-// Getaudit_addr
-// Setaudit_addr
-// Auditctl
-// Bsdthread_create
-// Bsdthread_terminate
-// Stack_snapshot
-// Bsdthread_register
-// Workq_open
-// Workq_ops
-// __mac_execve
-// __mac_syscall
-// __mac_get_file
-// __mac_set_file
-// __mac_get_link
-// __mac_set_link
-// __mac_get_proc
-// __mac_set_proc
-// __mac_get_fd
-// __mac_set_fd
-// __mac_get_pid
-// __mac_get_lcid
-// __mac_get_lctx
-// __mac_set_lctx
-// Setlcid
-// Read_nocancel
-// Write_nocancel
-// Open_nocancel
-// Close_nocancel
-// Wait4_nocancel
-// Recvmsg_nocancel
-// Sendmsg_nocancel
-// Recvfrom_nocancel
-// Accept_nocancel
-// Fcntl_nocancel
-// Select_nocancel
-// Fsync_nocancel
-// Connect_nocancel
-// Sigsuspend_nocancel
-// Readv_nocancel
-// Writev_nocancel
-// Sendto_nocancel
-// Pread_nocancel
-// Pwrite_nocancel
-// Waitid_nocancel
-// Poll_nocancel
-// Msgsnd_nocancel
-// Msgrcv_nocancel
-// Sem_wait_nocancel
-// Aio_suspend_nocancel
-// __sigwait_nocancel
-// __semwait_signal_nocancel
-// __mac_mount
-// __mac_get_mount
-// __mac_getfsstat
diff --git a/vendor/golang.org/x/sys/unix/syscall_darwin_amd64.go b/vendor/golang.org/x/sys/unix/syscall_darwin_amd64.go
index 9fa879806..0eaecf5fc 100644
--- a/vendor/golang.org/x/sys/unix/syscall_darwin_amd64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_darwin_amd64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build amd64 && darwin
-// +build amd64,darwin
package unix
@@ -47,6 +46,5 @@ func Syscall9(num, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2 uintptr,
//sys getfsstat(buf unsafe.Pointer, size uintptr, flags int) (n int, err error) = SYS_GETFSSTAT64
//sys Lstat(path string, stat *Stat_t) (err error) = SYS_LSTAT64
//sys ptrace1(request int, pid int, addr uintptr, data uintptr) (err error) = SYS_ptrace
-//sys ptrace1Ptr(request int, pid int, addr unsafe.Pointer, data uintptr) (err error) = SYS_ptrace
//sys Stat(path string, stat *Stat_t) (err error) = SYS_STAT64
//sys Statfs(path string, stat *Statfs_t) (err error) = SYS_STATFS64
diff --git a/vendor/golang.org/x/sys/unix/syscall_darwin_arm64.go b/vendor/golang.org/x/sys/unix/syscall_darwin_arm64.go
index f17b8c526..f36c6707c 100644
--- a/vendor/golang.org/x/sys/unix/syscall_darwin_arm64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_darwin_arm64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build arm64 && darwin
-// +build arm64,darwin
package unix
@@ -47,6 +46,5 @@ func Syscall9(num, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2 uintptr,
//sys getfsstat(buf unsafe.Pointer, size uintptr, flags int) (n int, err error) = SYS_GETFSSTAT
//sys Lstat(path string, stat *Stat_t) (err error)
//sys ptrace1(request int, pid int, addr uintptr, data uintptr) (err error) = SYS_ptrace
-//sys ptrace1Ptr(request int, pid int, addr unsafe.Pointer, data uintptr) (err error) = SYS_ptrace
//sys Stat(path string, stat *Stat_t) (err error)
//sys Statfs(path string, stat *Statfs_t) (err error)
diff --git a/vendor/golang.org/x/sys/unix/syscall_darwin_libSystem.go b/vendor/golang.org/x/sys/unix/syscall_darwin_libSystem.go
index 53c96641f..2f0fa76e4 100644
--- a/vendor/golang.org/x/sys/unix/syscall_darwin_libSystem.go
+++ b/vendor/golang.org/x/sys/unix/syscall_darwin_libSystem.go
@@ -2,8 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build darwin && go1.12
-// +build darwin,go1.12
+//go:build darwin
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_dragonfly.go b/vendor/golang.org/x/sys/unix/syscall_dragonfly.go
index d4ce988e7..97cb916f2 100644
--- a/vendor/golang.org/x/sys/unix/syscall_dragonfly.go
+++ b/vendor/golang.org/x/sys/unix/syscall_dragonfly.go
@@ -343,203 +343,5 @@ func Sendfile(outfd int, infd int, offset *int64, count int) (written int, err e
//sys write(fd int, p []byte) (n int, err error)
//sys mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) (ret uintptr, err error)
//sys munmap(addr uintptr, length uintptr) (err error)
-//sys readlen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_READ
-//sys writelen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_WRITE
//sys accept4(fd int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (nfd int, err error)
//sys utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error)
-
-/*
- * Unimplemented
- * TODO(jsing): Update this list for DragonFly.
- */
-// Profil
-// Sigaction
-// Sigprocmask
-// Getlogin
-// Sigpending
-// Sigaltstack
-// Reboot
-// Execve
-// Vfork
-// Sbrk
-// Sstk
-// Ovadvise
-// Mincore
-// Setitimer
-// Swapon
-// Select
-// Sigsuspend
-// Readv
-// Writev
-// Nfssvc
-// Getfh
-// Quotactl
-// Mount
-// Csops
-// Waitid
-// Add_profil
-// Kdebug_trace
-// Sigreturn
-// Atsocket
-// Kqueue_from_portset_np
-// Kqueue_portset
-// Getattrlist
-// Setattrlist
-// Getdirentriesattr
-// Searchfs
-// Delete
-// Copyfile
-// Watchevent
-// Waitevent
-// Modwatch
-// Getxattr
-// Fgetxattr
-// Setxattr
-// Fsetxattr
-// Removexattr
-// Fremovexattr
-// Listxattr
-// Flistxattr
-// Fsctl
-// Initgroups
-// Posix_spawn
-// Nfsclnt
-// Fhopen
-// Minherit
-// Semsys
-// Msgsys
-// Shmsys
-// Semctl
-// Semget
-// Semop
-// Msgctl
-// Msgget
-// Msgsnd
-// Msgrcv
-// Shmat
-// Shmctl
-// Shmdt
-// Shmget
-// Shm_open
-// Shm_unlink
-// Sem_open
-// Sem_close
-// Sem_unlink
-// Sem_wait
-// Sem_trywait
-// Sem_post
-// Sem_getvalue
-// Sem_init
-// Sem_destroy
-// Open_extended
-// Umask_extended
-// Stat_extended
-// Lstat_extended
-// Fstat_extended
-// Chmod_extended
-// Fchmod_extended
-// Access_extended
-// Settid
-// Gettid
-// Setsgroups
-// Getsgroups
-// Setwgroups
-// Getwgroups
-// Mkfifo_extended
-// Mkdir_extended
-// Identitysvc
-// Shared_region_check_np
-// Shared_region_map_np
-// __pthread_mutex_destroy
-// __pthread_mutex_init
-// __pthread_mutex_lock
-// __pthread_mutex_trylock
-// __pthread_mutex_unlock
-// __pthread_cond_init
-// __pthread_cond_destroy
-// __pthread_cond_broadcast
-// __pthread_cond_signal
-// Setsid_with_pid
-// __pthread_cond_timedwait
-// Aio_fsync
-// Aio_return
-// Aio_suspend
-// Aio_cancel
-// Aio_error
-// Aio_read
-// Aio_write
-// Lio_listio
-// __pthread_cond_wait
-// Iopolicysys
-// __pthread_kill
-// __pthread_sigmask
-// __sigwait
-// __disable_threadsignal
-// __pthread_markcancel
-// __pthread_canceled
-// __semwait_signal
-// Proc_info
-// Stat64_extended
-// Lstat64_extended
-// Fstat64_extended
-// __pthread_chdir
-// __pthread_fchdir
-// Audit
-// Auditon
-// Getauid
-// Setauid
-// Getaudit
-// Setaudit
-// Getaudit_addr
-// Setaudit_addr
-// Auditctl
-// Bsdthread_create
-// Bsdthread_terminate
-// Stack_snapshot
-// Bsdthread_register
-// Workq_open
-// Workq_ops
-// __mac_execve
-// __mac_syscall
-// __mac_get_file
-// __mac_set_file
-// __mac_get_link
-// __mac_set_link
-// __mac_get_proc
-// __mac_set_proc
-// __mac_get_fd
-// __mac_set_fd
-// __mac_get_pid
-// __mac_get_lcid
-// __mac_get_lctx
-// __mac_set_lctx
-// Setlcid
-// Read_nocancel
-// Write_nocancel
-// Open_nocancel
-// Close_nocancel
-// Wait4_nocancel
-// Recvmsg_nocancel
-// Sendmsg_nocancel
-// Recvfrom_nocancel
-// Accept_nocancel
-// Fcntl_nocancel
-// Select_nocancel
-// Fsync_nocancel
-// Connect_nocancel
-// Sigsuspend_nocancel
-// Readv_nocancel
-// Writev_nocancel
-// Sendto_nocancel
-// Pread_nocancel
-// Pwrite_nocancel
-// Waitid_nocancel
-// Msgsnd_nocancel
-// Msgrcv_nocancel
-// Sem_wait_nocancel
-// Aio_suspend_nocancel
-// __sigwait_nocancel
-// __semwait_signal_nocancel
-// __mac_mount
-// __mac_get_mount
-// __mac_getfsstat
diff --git a/vendor/golang.org/x/sys/unix/syscall_dragonfly_amd64.go b/vendor/golang.org/x/sys/unix/syscall_dragonfly_amd64.go
index 4e2d32120..14bab6b2d 100644
--- a/vendor/golang.org/x/sys/unix/syscall_dragonfly_amd64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_dragonfly_amd64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build amd64 && dragonfly
-// +build amd64,dragonfly
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_freebsd.go b/vendor/golang.org/x/sys/unix/syscall_freebsd.go
index afb10106f..2b57e0f73 100644
--- a/vendor/golang.org/x/sys/unix/syscall_freebsd.go
+++ b/vendor/golang.org/x/sys/unix/syscall_freebsd.go
@@ -13,6 +13,7 @@
package unix
import (
+ "errors"
"sync"
"unsafe"
)
@@ -169,25 +170,26 @@ func Getfsstat(buf []Statfs_t, flags int) (n int, err error) {
func Uname(uname *Utsname) error {
mib := []_C_int{CTL_KERN, KERN_OSTYPE}
n := unsafe.Sizeof(uname.Sysname)
- if err := sysctl(mib, &uname.Sysname[0], &n, nil, 0); err != nil {
+ // Suppress ENOMEM errors to be compatible with the C library __xuname() implementation.
+ if err := sysctl(mib, &uname.Sysname[0], &n, nil, 0); err != nil && !errors.Is(err, ENOMEM) {
return err
}
mib = []_C_int{CTL_KERN, KERN_HOSTNAME}
n = unsafe.Sizeof(uname.Nodename)
- if err := sysctl(mib, &uname.Nodename[0], &n, nil, 0); err != nil {
+ if err := sysctl(mib, &uname.Nodename[0], &n, nil, 0); err != nil && !errors.Is(err, ENOMEM) {
return err
}
mib = []_C_int{CTL_KERN, KERN_OSRELEASE}
n = unsafe.Sizeof(uname.Release)
- if err := sysctl(mib, &uname.Release[0], &n, nil, 0); err != nil {
+ if err := sysctl(mib, &uname.Release[0], &n, nil, 0); err != nil && !errors.Is(err, ENOMEM) {
return err
}
mib = []_C_int{CTL_KERN, KERN_VERSION}
n = unsafe.Sizeof(uname.Version)
- if err := sysctl(mib, &uname.Version[0], &n, nil, 0); err != nil {
+ if err := sysctl(mib, &uname.Version[0], &n, nil, 0); err != nil && !errors.Is(err, ENOMEM) {
return err
}
@@ -205,7 +207,7 @@ func Uname(uname *Utsname) error {
mib = []_C_int{CTL_HW, HW_MACHINE}
n = unsafe.Sizeof(uname.Machine)
- if err := sysctl(mib, &uname.Machine[0], &n, nil, 0); err != nil {
+ if err := sysctl(mib, &uname.Machine[0], &n, nil, 0); err != nil && !errors.Is(err, ENOMEM) {
return err
}
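A minimal FreeBSD-only sketch of the call this hardening affects: with ENOMEM suppressed, oversized sysctl values are truncated instead of failing the whole Uname call, matching the C library's __xuname behaviour.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	var uts unix.Utsname
	if err := unix.Uname(&uts); err != nil {
		panic(err)
	}
	// Fields are NUL-padded byte arrays; convert them to Go strings.
	fmt.Println(unix.ByteSliceToString(uts.Sysname[:]), unix.ByteSliceToString(uts.Release[:]))
}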
@@ -449,197 +451,5 @@ func Dup3(oldfd, newfd, flags int) error {
//sys write(fd int, p []byte) (n int, err error)
//sys mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) (ret uintptr, err error)
//sys munmap(addr uintptr, length uintptr) (err error)
-//sys readlen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_READ
-//sys writelen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_WRITE
//sys accept4(fd int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (nfd int, err error)
//sys utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error)
-
-/*
- * Unimplemented
- */
-// Profil
-// Sigaction
-// Sigprocmask
-// Getlogin
-// Sigpending
-// Sigaltstack
-// Ioctl
-// Reboot
-// Execve
-// Vfork
-// Sbrk
-// Sstk
-// Ovadvise
-// Mincore
-// Setitimer
-// Swapon
-// Select
-// Sigsuspend
-// Readv
-// Writev
-// Nfssvc
-// Getfh
-// Quotactl
-// Mount
-// Csops
-// Waitid
-// Add_profil
-// Kdebug_trace
-// Sigreturn
-// Atsocket
-// Kqueue_from_portset_np
-// Kqueue_portset
-// Getattrlist
-// Setattrlist
-// Getdents
-// Getdirentriesattr
-// Searchfs
-// Delete
-// Copyfile
-// Watchevent
-// Waitevent
-// Modwatch
-// Fsctl
-// Initgroups
-// Posix_spawn
-// Nfsclnt
-// Fhopen
-// Minherit
-// Semsys
-// Msgsys
-// Shmsys
-// Semctl
-// Semget
-// Semop
-// Msgctl
-// Msgget
-// Msgsnd
-// Msgrcv
-// Shmat
-// Shmctl
-// Shmdt
-// Shmget
-// Shm_open
-// Shm_unlink
-// Sem_open
-// Sem_close
-// Sem_unlink
-// Sem_wait
-// Sem_trywait
-// Sem_post
-// Sem_getvalue
-// Sem_init
-// Sem_destroy
-// Open_extended
-// Umask_extended
-// Stat_extended
-// Lstat_extended
-// Fstat_extended
-// Chmod_extended
-// Fchmod_extended
-// Access_extended
-// Settid
-// Gettid
-// Setsgroups
-// Getsgroups
-// Setwgroups
-// Getwgroups
-// Mkfifo_extended
-// Mkdir_extended
-// Identitysvc
-// Shared_region_check_np
-// Shared_region_map_np
-// __pthread_mutex_destroy
-// __pthread_mutex_init
-// __pthread_mutex_lock
-// __pthread_mutex_trylock
-// __pthread_mutex_unlock
-// __pthread_cond_init
-// __pthread_cond_destroy
-// __pthread_cond_broadcast
-// __pthread_cond_signal
-// Setsid_with_pid
-// __pthread_cond_timedwait
-// Aio_fsync
-// Aio_return
-// Aio_suspend
-// Aio_cancel
-// Aio_error
-// Aio_read
-// Aio_write
-// Lio_listio
-// __pthread_cond_wait
-// Iopolicysys
-// __pthread_kill
-// __pthread_sigmask
-// __sigwait
-// __disable_threadsignal
-// __pthread_markcancel
-// __pthread_canceled
-// __semwait_signal
-// Proc_info
-// Stat64_extended
-// Lstat64_extended
-// Fstat64_extended
-// __pthread_chdir
-// __pthread_fchdir
-// Audit
-// Auditon
-// Getauid
-// Setauid
-// Getaudit
-// Setaudit
-// Getaudit_addr
-// Setaudit_addr
-// Auditctl
-// Bsdthread_create
-// Bsdthread_terminate
-// Stack_snapshot
-// Bsdthread_register
-// Workq_open
-// Workq_ops
-// __mac_execve
-// __mac_syscall
-// __mac_get_file
-// __mac_set_file
-// __mac_get_link
-// __mac_set_link
-// __mac_get_proc
-// __mac_set_proc
-// __mac_get_fd
-// __mac_set_fd
-// __mac_get_pid
-// __mac_get_lcid
-// __mac_get_lctx
-// __mac_set_lctx
-// Setlcid
-// Read_nocancel
-// Write_nocancel
-// Open_nocancel
-// Close_nocancel
-// Wait4_nocancel
-// Recvmsg_nocancel
-// Sendmsg_nocancel
-// Recvfrom_nocancel
-// Accept_nocancel
-// Fcntl_nocancel
-// Select_nocancel
-// Fsync_nocancel
-// Connect_nocancel
-// Sigsuspend_nocancel
-// Readv_nocancel
-// Writev_nocancel
-// Sendto_nocancel
-// Pread_nocancel
-// Pwrite_nocancel
-// Waitid_nocancel
-// Poll_nocancel
-// Msgsnd_nocancel
-// Msgrcv_nocancel
-// Sem_wait_nocancel
-// Aio_suspend_nocancel
-// __sigwait_nocancel
-// __semwait_signal_nocancel
-// __mac_mount
-// __mac_get_mount
-// __mac_getfsstat
diff --git a/vendor/golang.org/x/sys/unix/syscall_freebsd_386.go b/vendor/golang.org/x/sys/unix/syscall_freebsd_386.go
index b8da51004..3967bca77 100644
--- a/vendor/golang.org/x/sys/unix/syscall_freebsd_386.go
+++ b/vendor/golang.org/x/sys/unix/syscall_freebsd_386.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build 386 && freebsd
-// +build 386,freebsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/syscall_freebsd_amd64.go
index 47155c483..eff19ada2 100644
--- a/vendor/golang.org/x/sys/unix/syscall_freebsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_freebsd_amd64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build amd64 && freebsd
-// +build amd64,freebsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_freebsd_arm.go b/vendor/golang.org/x/sys/unix/syscall_freebsd_arm.go
index 08932093f..4f24b517a 100644
--- a/vendor/golang.org/x/sys/unix/syscall_freebsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/syscall_freebsd_arm.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build arm && freebsd
-// +build arm,freebsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/syscall_freebsd_arm64.go
index d151a0d0e..ac30759ec 100644
--- a/vendor/golang.org/x/sys/unix/syscall_freebsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_freebsd_arm64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build arm64 && freebsd
-// +build arm64,freebsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_freebsd_riscv64.go b/vendor/golang.org/x/sys/unix/syscall_freebsd_riscv64.go
index d5cd64b37..aab725ca7 100644
--- a/vendor/golang.org/x/sys/unix/syscall_freebsd_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_freebsd_riscv64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build riscv64 && freebsd
-// +build riscv64,freebsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_hurd.go b/vendor/golang.org/x/sys/unix/syscall_hurd.go
index 381fd4673..ba46651f8 100644
--- a/vendor/golang.org/x/sys/unix/syscall_hurd.go
+++ b/vendor/golang.org/x/sys/unix/syscall_hurd.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build hurd
-// +build hurd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_hurd_386.go b/vendor/golang.org/x/sys/unix/syscall_hurd_386.go
index 7cf54a3e4..df89f9e6b 100644
--- a/vendor/golang.org/x/sys/unix/syscall_hurd_386.go
+++ b/vendor/golang.org/x/sys/unix/syscall_hurd_386.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build 386 && hurd
-// +build 386,hurd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_illumos.go b/vendor/golang.org/x/sys/unix/syscall_illumos.go
index 87db5a6a8..a863f7052 100644
--- a/vendor/golang.org/x/sys/unix/syscall_illumos.go
+++ b/vendor/golang.org/x/sys/unix/syscall_illumos.go
@@ -5,7 +5,6 @@
// illumos system calls not present on Solaris.
//go:build amd64 && illumos
-// +build amd64,illumos
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux.go b/vendor/golang.org/x/sys/unix/syscall_linux.go
index a730878e4..5682e2628 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux.go
@@ -61,15 +61,23 @@ func FanotifyMark(fd int, flags uint, mask uint64, dirFd int, pathname string) (
}
//sys fchmodat(dirfd int, path string, mode uint32) (err error)
+//sys fchmodat2(dirfd int, path string, mode uint32, flags int) (err error)
-func Fchmodat(dirfd int, path string, mode uint32, flags int) (err error) {
- // Linux fchmodat doesn't support the flags parameter. Mimick glibc's behavior
- // and check the flags. Otherwise the mode would be applied to the symlink
- // destination which is not what the user expects.
- if flags&^AT_SYMLINK_NOFOLLOW != 0 {
- return EINVAL
- } else if flags&AT_SYMLINK_NOFOLLOW != 0 {
- return EOPNOTSUPP
+func Fchmodat(dirfd int, path string, mode uint32, flags int) error {
+ // Linux fchmodat doesn't support the flags parameter, but fchmodat2 does.
+ // Try fchmodat2 if flags are specified.
+ if flags != 0 {
+ err := fchmodat2(dirfd, path, mode, flags)
+ if err == ENOSYS {
+ // fchmodat2 isn't available. If the flags are known to be valid,
+ // return EOPNOTSUPP to indicate that fchmodat doesn't support them.
+ if flags&^(AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH) != 0 {
+ return EINVAL
+ } else if flags&(AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH) != 0 {
+ return EOPNOTSUPP
+ }
+ }
+ return err
}
return fchmodat(dirfd, path, mode)
}
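A minimal sketch of the new fallback path, using a hypothetical /tmp/some-symlink: when the kernel provides fchmodat2 (Linux 6.6+), AT_SYMLINK_NOFOLLOW is now honoured, and on older kernels the call still degrades to EOPNOTSUPP rather than never working at all.

package main

import (
	"errors"
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// Change the mode of the symlink itself, not its target.
	err := unix.Fchmodat(unix.AT_FDCWD, "/tmp/some-symlink", 0o600, unix.AT_SYMLINK_NOFOLLOW)
	if errors.Is(err, unix.EOPNOTSUPP) {
		fmt.Println("kernel lacks fchmodat2; flag not supported")
	} else if err != nil {
		panic(err)
	}
}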
@@ -417,7 +425,8 @@ func (sa *SockaddrUnix) sockaddr() (unsafe.Pointer, _Socklen, error) {
if n > 0 {
sl += _Socklen(n) + 1
}
- if sa.raw.Path[0] == '@' {
+ if sa.raw.Path[0] == '@' || (sa.raw.Path[0] == 0 && sl > 3) {
+ // Check sl > 3 so we don't change unnamed socket behavior.
sa.raw.Path[0] = 0
// Don't count trailing NUL for abstract address.
sl--
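A minimal sketch of the address form this hunk concerns: a leading '@' (and, with this change, also a leading NUL byte) marks a Linux abstract-namespace socket, so binding it creates no filesystem entry. The socket name is an arbitrary example.

package main

import "golang.org/x/sys/unix"

func main() {
	fd, err := unix.Socket(unix.AF_UNIX, unix.SOCK_STREAM, 0)
	if err != nil {
		panic(err)
	}
	defer unix.Close(fd)

	// "@demo" becomes "\x00demo" on the wire; nothing is created on disk.
	if err := unix.Bind(fd, &unix.SockaddrUnix{Name: "@demo"}); err != nil {
		panic(err)
	}
	if err := unix.Listen(fd, 1); err != nil {
		panic(err)
	}
}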
@@ -693,10 +702,10 @@ type SockaddrALG struct {
func (sa *SockaddrALG) sockaddr() (unsafe.Pointer, _Socklen, error) {
// Leave room for NUL byte terminator.
- if len(sa.Type) > 13 {
+ if len(sa.Type) > len(sa.raw.Type)-1 {
return nil, 0, EINVAL
}
- if len(sa.Name) > 63 {
+ if len(sa.Name) > len(sa.raw.Name)-1 {
return nil, 0, EINVAL
}
@@ -704,17 +713,8 @@ func (sa *SockaddrALG) sockaddr() (unsafe.Pointer, _Socklen, error) {
sa.raw.Feat = sa.Feature
sa.raw.Mask = sa.Mask
- typ, err := ByteSliceFromString(sa.Type)
- if err != nil {
- return nil, 0, err
- }
- name, err := ByteSliceFromString(sa.Name)
- if err != nil {
- return nil, 0, err
- }
-
- copy(sa.raw.Type[:], typ)
- copy(sa.raw.Name[:], name)
+ copy(sa.raw.Type[:], sa.Type)
+ copy(sa.raw.Name[:], sa.Name)
return unsafe.Pointer(&sa.raw), SizeofSockaddrALG, nil
}
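A minimal sketch of SockaddrALG in use, hashing a buffer through the kernel crypto API; after this change the length checks come from the raw sockaddr field sizes rather than the hard-coded 13/63 limits. "hash"/"sha256" are standard AF_ALG identifiers.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	fd, err := unix.Socket(unix.AF_ALG, unix.SOCK_SEQPACKET, 0)
	if err != nil {
		panic(err)
	}
	defer unix.Close(fd)

	if err := unix.Bind(fd, &unix.SockaddrALG{Type: "hash", Name: "sha256"}); err != nil {
		panic(err)
	}
	op, _, err := unix.Accept(fd) // per-operation fd
	if err != nil {
		panic(err)
	}
	defer unix.Close(op)

	if _, err := unix.Write(op, []byte("hello")); err != nil {
		panic(err)
	}
	digest := make([]byte, 32) // SHA-256 output size
	if _, err := unix.Read(op, digest); err != nil {
		panic(err)
	}
	fmt.Printf("%x\n", digest)
}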
@@ -1310,7 +1310,7 @@ func GetsockoptString(fd, level, opt int) (string, error) {
return "", err
}
}
- return string(buf[:vallen-1]), nil
+ return ByteSliceToString(buf[:vallen]), nil
}
func GetsockoptTpacketStats(fd, level, opt int) (*TpacketStats, error) {
@@ -1849,6 +1849,105 @@ func Dup2(oldfd, newfd int) error {
//sys Fsmount(fd int, flags int, mountAttrs int) (fsfd int, err error)
//sys Fsopen(fsName string, flags int) (fd int, err error)
//sys Fspick(dirfd int, pathName string, flags int) (fd int, err error)
+
+//sys fsconfig(fd int, cmd uint, key *byte, value *byte, aux int) (err error)
+
+func fsconfigCommon(fd int, cmd uint, key string, value *byte, aux int) (err error) {
+ var keyp *byte
+ if keyp, err = BytePtrFromString(key); err != nil {
+ return
+ }
+ return fsconfig(fd, cmd, keyp, value, aux)
+}
+
+// FsconfigSetFlag is equivalent to fsconfig(2) called
+// with cmd == FSCONFIG_SET_FLAG.
+//
+// fd is the filesystem context to act upon.
+// key the parameter key to set.
+func FsconfigSetFlag(fd int, key string) (err error) {
+ return fsconfigCommon(fd, FSCONFIG_SET_FLAG, key, nil, 0)
+}
+
+// FsconfigSetString is equivalent to fsconfig(2) called
+// with cmd == FSCONFIG_SET_STRING.
+//
+// fd is the filesystem context to act upon.
+// key the parameter key to set.
+// value is the parameter value to set.
+func FsconfigSetString(fd int, key string, value string) (err error) {
+ var valuep *byte
+ if valuep, err = BytePtrFromString(value); err != nil {
+ return
+ }
+ return fsconfigCommon(fd, FSCONFIG_SET_STRING, key, valuep, 0)
+}
+
+// FsconfigSetBinary is equivalent to fsconfig(2) called
+// with cmd == FSCONFIG_SET_BINARY.
+//
+// fd is the filesystem context to act upon.
+// key the parameter key to set.
+// value is the parameter value to set.
+func FsconfigSetBinary(fd int, key string, value []byte) (err error) {
+ if len(value) == 0 {
+ return EINVAL
+ }
+ return fsconfigCommon(fd, FSCONFIG_SET_BINARY, key, &value[0], len(value))
+}
+
+// FsconfigSetPath is equivalent to fsconfig(2) called
+// with cmd == FSCONFIG_SET_PATH.
+//
+// fd is the filesystem context to act upon.
+// key the parameter key to set.
+// path is a non-empty path for specified key.
+// atfd is a file descriptor at which to start lookup from or AT_FDCWD.
+func FsconfigSetPath(fd int, key string, path string, atfd int) (err error) {
+ var valuep *byte
+ if valuep, err = BytePtrFromString(path); err != nil {
+ return
+ }
+ return fsconfigCommon(fd, FSCONFIG_SET_PATH, key, valuep, atfd)
+}
+
+// FsconfigSetPathEmpty is equivalent to fsconfig(2) called
+// with cmd == FSCONFIG_SET_PATH_EMPTY. The same as
+// FsconfigSetPath but with AT_EMPTY_PATH implied.
+func FsconfigSetPathEmpty(fd int, key string, path string, atfd int) (err error) {
+ var valuep *byte
+ if valuep, err = BytePtrFromString(path); err != nil {
+ return
+ }
+ return fsconfigCommon(fd, FSCONFIG_SET_PATH_EMPTY, key, valuep, atfd)
+}
+
+// FsconfigSetFd is equivalent to fsconfig(2) called
+// with cmd == FSCONFIG_SET_FD.
+//
+// fd is the filesystem context to act upon.
+// key the parameter key to set.
+// value is a file descriptor to be assigned to specified key.
+func FsconfigSetFd(fd int, key string, value int) (err error) {
+ return fsconfigCommon(fd, FSCONFIG_SET_FD, key, nil, value)
+}
+
+// FsconfigCreate is equivalent to fsconfig(2) called
+// with cmd == FSCONFIG_CMD_CREATE.
+//
+// fd is the filesystem context to act upon.
+func FsconfigCreate(fd int) (err error) {
+ return fsconfig(fd, FSCONFIG_CMD_CREATE, nil, nil, 0)
+}
+
+// FsconfigReconfigure is equivalent to fsconfig(2) called
+// with cmd == FSCONFIG_CMD_RECONFIGURE.
+//
+// fd is the filesystem context to act upon.
+func FsconfigReconfigure(fd int) (err error) {
+ return fsconfig(fd, FSCONFIG_CMD_RECONFIGURE, nil, nil, 0)
+}
+
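For orientation, a hedged usage sketch (not part of the upstream patch) of how the Fsconfig* wrappers above compose with the existing Fsopen, Fsmount and MoveMount wrappers from this package; the filesystem type, source device and mount point are placeholder values.

package fsconfigsketch

import "golang.org/x/sys/unix"

// mountReadOnly is a hypothetical helper showing the fsopen/fsconfig/fsmount flow.
func mountReadOnly() error {
	fsfd, err := unix.Fsopen("ext4", unix.FSOPEN_CLOEXEC) // open a filesystem context
	if err != nil {
		return err
	}
	defer unix.Close(fsfd)
	if err := unix.FsconfigSetString(fsfd, "source", "/dev/sda1"); err != nil {
		return err
	}
	if err := unix.FsconfigCreate(fsfd); err != nil { // instantiate the superblock
		return err
	}
	mfd, err := unix.Fsmount(fsfd, unix.FSMOUNT_CLOEXEC, unix.MOUNT_ATTR_RDONLY)
	if err != nil {
		return err
	}
	defer unix.Close(mfd)
	// Attach the detached mount at /mnt; the empty path plus MOVE_MOUNT_F_EMPTY_PATH
	// makes mfd itself the source of the move.
	return unix.MoveMount(mfd, "", unix.AT_FDCWD, "/mnt", unix.MOVE_MOUNT_F_EMPTY_PATH)
}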
//sys Getdents(fd int, buf []byte) (n int, err error) = SYS_GETDENTS64
//sysnb Getpgid(pid int) (pgid int, err error)
@@ -1988,8 +2087,6 @@ func Signalfd(fd int, sigmask *Sigset_t, flags int) (newfd int, err error) {
//sys Unshare(flags int) (err error)
//sys write(fd int, p []byte) (n int, err error)
//sys exitThread(code int) (err error) = SYS_EXIT
-//sys readlen(fd int, p *byte, np int) (n int, err error) = SYS_READ
-//sys writelen(fd int, p *byte, np int) (n int, err error) = SYS_WRITE
//sys readv(fd int, iovs []Iovec) (n int, err error) = SYS_READV
//sys writev(fd int, iovs []Iovec) (n int, err error) = SYS_WRITEV
//sys preadv(fd int, iovs []Iovec, offs_l uintptr, offs_h uintptr) (n int, err error) = SYS_PREADV
@@ -2471,98 +2568,27 @@ func Pselect(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timespec, sigmask *
return pselect6(nfd, r, w, e, mutableTimeout, kernelMask)
}
-/*
- * Unimplemented
- */
-// AfsSyscall
-// ArchPrctl
-// Brk
-// ClockNanosleep
-// ClockSettime
-// Clone
-// EpollCtlOld
-// EpollPwait
-// EpollWaitOld
-// Execve
-// Fork
-// Futex
-// GetKernelSyms
-// GetMempolicy
-// GetRobustList
-// GetThreadArea
-// Getpmsg
-// IoCancel
-// IoDestroy
-// IoGetevents
-// IoSetup
-// IoSubmit
-// IoprioGet
-// IoprioSet
-// KexecLoad
-// LookupDcookie
-// Mbind
-// MigratePages
-// Mincore
-// ModifyLdt
-// Mount
-// MovePages
-// MqGetsetattr
-// MqNotify
-// MqOpen
-// MqTimedreceive
-// MqTimedsend
-// MqUnlink
-// Msgctl
-// Msgget
-// Msgrcv
-// Msgsnd
-// Nfsservctl
-// Personality
-// Pselect6
-// Ptrace
-// Putpmsg
-// Quotactl
-// Readahead
-// Readv
-// RemapFilePages
-// RestartSyscall
-// RtSigaction
-// RtSigpending
-// RtSigqueueinfo
-// RtSigreturn
-// RtSigsuspend
-// RtSigtimedwait
-// SchedGetPriorityMax
-// SchedGetPriorityMin
-// SchedGetparam
-// SchedGetscheduler
-// SchedRrGetInterval
-// SchedSetparam
-// SchedYield
-// Security
-// Semctl
-// Semget
-// Semop
-// Semtimedop
-// SetMempolicy
-// SetRobustList
-// SetThreadArea
-// SetTidAddress
-// Sigaltstack
-// Swapoff
-// Swapon
-// Sysfs
-// TimerCreate
-// TimerDelete
-// TimerGetoverrun
-// TimerGettime
-// TimerSettime
-// Tkill (obsolete)
-// Tuxcall
-// Umount2
-// Uselib
-// Utimensat
-// Vfork
-// Vhangup
-// Vserver
-// _Sysctl
+//sys schedSetattr(pid int, attr *SchedAttr, flags uint) (err error)
+//sys schedGetattr(pid int, attr *SchedAttr, size uint, flags uint) (err error)
+
+// SchedSetAttr is a wrapper for the sched_setattr(2) syscall.
+// https://man7.org/linux/man-pages/man2/sched_setattr.2.html
+func SchedSetAttr(pid int, attr *SchedAttr, flags uint) error {
+ if attr == nil {
+ return EINVAL
+ }
+ attr.Size = SizeofSchedAttr
+ return schedSetattr(pid, attr, flags)
+}
+
+// SchedGetAttr is a wrapper for the sched_getattr(2) syscall.
+// https://man7.org/linux/man-pages/man2/sched_getattr.2.html
+func SchedGetAttr(pid int, flags uint) (*SchedAttr, error) {
+ attr := &SchedAttr{}
+ if err := schedGetattr(pid, attr, SizeofSchedAttr, flags); err != nil {
+ return nil, err
+ }
+ return attr, nil
+}
+
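A hedged usage sketch (not from the patch) of the two wrappers above; the SchedAttr field names are assumed to match the generated struct in this package, and the priority value is arbitrary.

package schedsketch

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// makeFIFO is a hypothetical helper: pid 0 means the calling thread.
func makeFIFO(pid int) error {
	attr := &unix.SchedAttr{
		Policy:   unix.SCHED_FIFO,
		Priority: 10, // arbitrary example priority
	}
	// SchedSetAttr fills in attr.Size before invoking sched_setattr(2).
	if err := unix.SchedSetAttr(pid, attr, 0); err != nil {
		return err
	}
	got, err := unix.SchedGetAttr(pid, 0)
	if err != nil {
		return err
	}
	fmt.Printf("policy=%d priority=%d\n", got.Policy, got.Priority)
	return nil
}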
+//sys Cachestat(fd uint, crange *CachestatRange, cstat *Cachestat_t, flags uint) (err error)
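Cachestat is new in this update; below is a hedged sketch of how it might be called. The field names are assumed to match the generated CachestatRange and Cachestat_t types, and a zero-length range asks about the file from the given offset to EOF.

package cachestatsketch

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// pageCacheResidency is a hypothetical helper reporting page-cache state for a file.
func pageCacheResidency(path string) error {
	fd, err := unix.Open(path, unix.O_RDONLY, 0)
	if err != nil {
		return err
	}
	defer unix.Close(fd)

	crange := unix.CachestatRange{Off: 0, Len: 0} // Len 0: from Off to end of file
	var cstat unix.Cachestat_t
	if err := unix.Cachestat(uint(fd), &crange, &cstat, 0); err != nil {
		return err
	}
	fmt.Printf("cached=%d dirty=%d writeback=%d\n", cstat.Cache, cstat.Dirty, cstat.Writeback)
	return nil
}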
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_386.go b/vendor/golang.org/x/sys/unix/syscall_linux_386.go
index c7d9945ea..506dafa7b 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_386.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_386.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build 386 && linux
-// +build 386,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_alarm.go b/vendor/golang.org/x/sys/unix/syscall_linux_alarm.go
index 08086ac6a..38d55641b 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_alarm.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_alarm.go
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && (386 || amd64 || mips || mipsle || mips64 || mipsle || ppc64 || ppc64le || ppc || s390x || sparc64)
-// +build linux
-// +build 386 amd64 mips mipsle mips64 mipsle ppc64 ppc64le ppc s390x sparc64
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_amd64.go b/vendor/golang.org/x/sys/unix/syscall_linux_amd64.go
index 70601ce36..d557cf8de 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_amd64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_amd64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build amd64 && linux
-// +build amd64,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_amd64_gc.go b/vendor/golang.org/x/sys/unix/syscall_linux_amd64_gc.go
index 8b0f0f3aa..facdb83b2 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_amd64_gc.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_amd64_gc.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build amd64 && linux && gc
-// +build amd64,linux,gc
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_arm.go b/vendor/golang.org/x/sys/unix/syscall_linux_arm.go
index da2986415..cd2dd797f 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_arm.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_arm.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build arm && linux
-// +build arm,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go b/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go
index f5266689a..cf2ee6c75 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build arm64 && linux
-// +build arm64,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_gc.go b/vendor/golang.org/x/sys/unix/syscall_linux_gc.go
index 2b1168d7d..ffc4c2b63 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_gc.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_gc.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && gc
-// +build linux,gc
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_gc_386.go b/vendor/golang.org/x/sys/unix/syscall_linux_gc_386.go
index 9843fb489..9ebfdcf44 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_gc_386.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_gc_386.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && gc && 386
-// +build linux,gc,386
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_gc_arm.go b/vendor/golang.org/x/sys/unix/syscall_linux_gc_arm.go
index a6008fccd..5f2b57c4c 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_gc_arm.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_gc_arm.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build arm && gc && linux
-// +build arm,gc,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_gccgo_386.go b/vendor/golang.org/x/sys/unix/syscall_linux_gccgo_386.go
index 7740af242..d1a3ad826 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_gccgo_386.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_gccgo_386.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && gccgo && 386
-// +build linux,gccgo,386
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_gccgo_arm.go b/vendor/golang.org/x/sys/unix/syscall_linux_gccgo_arm.go
index e16a12299..f2f67423e 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_gccgo_arm.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_gccgo_arm.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && gccgo && arm
-// +build linux,gccgo,arm
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go b/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go
index f6ab02ec1..3d0e98451 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build loong64 && linux
-// +build loong64,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_mips64x.go b/vendor/golang.org/x/sys/unix/syscall_linux_mips64x.go
index 93fe59d25..70963a95a 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_mips64x.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_mips64x.go
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && (mips64 || mips64le)
-// +build linux
-// +build mips64 mips64le
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_mipsx.go b/vendor/golang.org/x/sys/unix/syscall_linux_mipsx.go
index aae7f0ffd..c218ebd28 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_mipsx.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_mipsx.go
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && (mips || mipsle)
-// +build linux
-// +build mips mipsle
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_ppc.go b/vendor/golang.org/x/sys/unix/syscall_linux_ppc.go
index 66eff19a3..e6c48500c 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_ppc.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_ppc.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && ppc
-// +build linux,ppc
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_ppc64x.go b/vendor/golang.org/x/sys/unix/syscall_linux_ppc64x.go
index 806aa2574..7286a9aa8 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_ppc64x.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_ppc64x.go
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && (ppc64 || ppc64le)
-// +build linux
-// +build ppc64 ppc64le
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go b/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go
index 5e6ceee12..6f5a28894 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build riscv64 && linux
-// +build riscv64,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_s390x.go b/vendor/golang.org/x/sys/unix/syscall_linux_s390x.go
index 2f89e8f5d..66f31210d 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_s390x.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_s390x.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build s390x && linux
-// +build s390x,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_sparc64.go b/vendor/golang.org/x/sys/unix/syscall_linux_sparc64.go
index 7ca064ae7..11d1f1698 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_sparc64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_sparc64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build sparc64 && linux
-// +build sparc64,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_netbsd.go b/vendor/golang.org/x/sys/unix/syscall_netbsd.go
index ddd1ac853..88162099a 100644
--- a/vendor/golang.org/x/sys/unix/syscall_netbsd.go
+++ b/vendor/golang.org/x/sys/unix/syscall_netbsd.go
@@ -356,8 +356,6 @@ func Statvfs(path string, buf *Statvfs_t) (err error) {
//sys write(fd int, p []byte) (n int, err error)
//sys mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) (ret uintptr, err error)
//sys munmap(addr uintptr, length uintptr) (err error)
-//sys readlen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_READ
-//sys writelen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_WRITE
//sys utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error)
const (
@@ -371,262 +369,3 @@ const (
func mremap(oldaddr uintptr, oldlength uintptr, newlength uintptr, flags int, newaddr uintptr) (uintptr, error) {
return mremapNetBSD(oldaddr, oldlength, newaddr, newlength, flags)
}
-
-/*
- * Unimplemented
- */
-// ____semctl13
-// __clone
-// __fhopen40
-// __fhstat40
-// __fhstatvfs140
-// __fstat30
-// __getcwd
-// __getfh30
-// __getlogin
-// __lstat30
-// __mount50
-// __msgctl13
-// __msync13
-// __ntp_gettime30
-// __posix_chown
-// __posix_fchown
-// __posix_lchown
-// __posix_rename
-// __setlogin
-// __shmctl13
-// __sigaction_sigtramp
-// __sigaltstack14
-// __sigpending14
-// __sigprocmask14
-// __sigsuspend14
-// __sigtimedwait
-// __stat30
-// __syscall
-// __vfork14
-// _ksem_close
-// _ksem_destroy
-// _ksem_getvalue
-// _ksem_init
-// _ksem_open
-// _ksem_post
-// _ksem_trywait
-// _ksem_unlink
-// _ksem_wait
-// _lwp_continue
-// _lwp_create
-// _lwp_ctl
-// _lwp_detach
-// _lwp_exit
-// _lwp_getname
-// _lwp_getprivate
-// _lwp_kill
-// _lwp_park
-// _lwp_self
-// _lwp_setname
-// _lwp_setprivate
-// _lwp_suspend
-// _lwp_unpark
-// _lwp_unpark_all
-// _lwp_wait
-// _lwp_wakeup
-// _pset_bind
-// _sched_getaffinity
-// _sched_getparam
-// _sched_setaffinity
-// _sched_setparam
-// acct
-// aio_cancel
-// aio_error
-// aio_fsync
-// aio_read
-// aio_return
-// aio_suspend
-// aio_write
-// break
-// clock_getres
-// clock_gettime
-// clock_settime
-// compat_09_ogetdomainname
-// compat_09_osetdomainname
-// compat_09_ouname
-// compat_10_omsgsys
-// compat_10_osemsys
-// compat_10_oshmsys
-// compat_12_fstat12
-// compat_12_getdirentries
-// compat_12_lstat12
-// compat_12_msync
-// compat_12_oreboot
-// compat_12_oswapon
-// compat_12_stat12
-// compat_13_sigaction13
-// compat_13_sigaltstack13
-// compat_13_sigpending13
-// compat_13_sigprocmask13
-// compat_13_sigreturn13
-// compat_13_sigsuspend13
-// compat_14___semctl
-// compat_14_msgctl
-// compat_14_shmctl
-// compat_16___sigaction14
-// compat_16___sigreturn14
-// compat_20_fhstatfs
-// compat_20_fstatfs
-// compat_20_getfsstat
-// compat_20_statfs
-// compat_30___fhstat30
-// compat_30___fstat13
-// compat_30___lstat13
-// compat_30___stat13
-// compat_30_fhopen
-// compat_30_fhstat
-// compat_30_fhstatvfs1
-// compat_30_getdents
-// compat_30_getfh
-// compat_30_ntp_gettime
-// compat_30_socket
-// compat_40_mount
-// compat_43_fstat43
-// compat_43_lstat43
-// compat_43_oaccept
-// compat_43_ocreat
-// compat_43_oftruncate
-// compat_43_ogetdirentries
-// compat_43_ogetdtablesize
-// compat_43_ogethostid
-// compat_43_ogethostname
-// compat_43_ogetkerninfo
-// compat_43_ogetpagesize
-// compat_43_ogetpeername
-// compat_43_ogetrlimit
-// compat_43_ogetsockname
-// compat_43_okillpg
-// compat_43_olseek
-// compat_43_ommap
-// compat_43_oquota
-// compat_43_orecv
-// compat_43_orecvfrom
-// compat_43_orecvmsg
-// compat_43_osend
-// compat_43_osendmsg
-// compat_43_osethostid
-// compat_43_osethostname
-// compat_43_osigblock
-// compat_43_osigsetmask
-// compat_43_osigstack
-// compat_43_osigvec
-// compat_43_otruncate
-// compat_43_owait
-// compat_43_stat43
-// execve
-// extattr_delete_fd
-// extattr_delete_file
-// extattr_delete_link
-// extattr_get_fd
-// extattr_get_file
-// extattr_get_link
-// extattr_list_fd
-// extattr_list_file
-// extattr_list_link
-// extattr_set_fd
-// extattr_set_file
-// extattr_set_link
-// extattrctl
-// fchroot
-// fdatasync
-// fgetxattr
-// fktrace
-// flistxattr
-// fork
-// fremovexattr
-// fsetxattr
-// fstatvfs1
-// fsync_range
-// getcontext
-// getitimer
-// getvfsstat
-// getxattr
-// ktrace
-// lchflags
-// lchmod
-// lfs_bmapv
-// lfs_markv
-// lfs_segclean
-// lfs_segwait
-// lgetxattr
-// lio_listio
-// listxattr
-// llistxattr
-// lremovexattr
-// lseek
-// lsetxattr
-// lutimes
-// madvise
-// mincore
-// minherit
-// modctl
-// mq_close
-// mq_getattr
-// mq_notify
-// mq_open
-// mq_receive
-// mq_send
-// mq_setattr
-// mq_timedreceive
-// mq_timedsend
-// mq_unlink
-// msgget
-// msgrcv
-// msgsnd
-// nfssvc
-// ntp_adjtime
-// pmc_control
-// pmc_get_info
-// pollts
-// preadv
-// profil
-// pselect
-// pset_assign
-// pset_create
-// pset_destroy
-// ptrace
-// pwritev
-// quotactl
-// rasctl
-// readv
-// reboot
-// removexattr
-// sa_enable
-// sa_preempt
-// sa_register
-// sa_setconcurrency
-// sa_stacks
-// sa_yield
-// sbrk
-// sched_yield
-// semconfig
-// semget
-// semop
-// setcontext
-// setitimer
-// setxattr
-// shmat
-// shmdt
-// shmget
-// sstk
-// statvfs1
-// swapctl
-// sysarch
-// syscall
-// timer_create
-// timer_delete
-// timer_getoverrun
-// timer_gettime
-// timer_settime
-// undelete
-// utrace
-// uuidgen
-// vadvise
-// vfork
-// writev
diff --git a/vendor/golang.org/x/sys/unix/syscall_netbsd_386.go b/vendor/golang.org/x/sys/unix/syscall_netbsd_386.go
index 5199d282f..7a5eb5743 100644
--- a/vendor/golang.org/x/sys/unix/syscall_netbsd_386.go
+++ b/vendor/golang.org/x/sys/unix/syscall_netbsd_386.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build 386 && netbsd
-// +build 386,netbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_netbsd_amd64.go b/vendor/golang.org/x/sys/unix/syscall_netbsd_amd64.go
index 70a9c52e9..62d8957ae 100644
--- a/vendor/golang.org/x/sys/unix/syscall_netbsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_netbsd_amd64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build amd64 && netbsd
-// +build amd64,netbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_netbsd_arm.go b/vendor/golang.org/x/sys/unix/syscall_netbsd_arm.go
index 3eb5942f9..ce6a06885 100644
--- a/vendor/golang.org/x/sys/unix/syscall_netbsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/syscall_netbsd_arm.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build arm && netbsd
-// +build arm,netbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_netbsd_arm64.go b/vendor/golang.org/x/sys/unix/syscall_netbsd_arm64.go
index fc6ccfd81..d46d689d1 100644
--- a/vendor/golang.org/x/sys/unix/syscall_netbsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_netbsd_arm64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build arm64 && netbsd
-// +build arm64,netbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd.go b/vendor/golang.org/x/sys/unix/syscall_openbsd.go
index c5f166a11..b25343c71 100644
--- a/vendor/golang.org/x/sys/unix/syscall_openbsd.go
+++ b/vendor/golang.org/x/sys/unix/syscall_openbsd.go
@@ -137,18 +137,13 @@ func sendfile(outfd int, infd int, offset *int64, count int) (written int, err e
}
func Getfsstat(buf []Statfs_t, flags int) (n int, err error) {
- var _p0 unsafe.Pointer
+ var bufptr *Statfs_t
var bufsize uintptr
if len(buf) > 0 {
- _p0 = unsafe.Pointer(&buf[0])
+ bufptr = &buf[0]
bufsize = unsafe.Sizeof(Statfs_t{}) * uintptr(len(buf))
}
- r0, _, e1 := Syscall(SYS_GETFSSTAT, uintptr(_p0), bufsize, uintptr(flags))
- n = int(r0)
- if e1 != 0 {
- err = e1
- }
- return
+ return getfsstat(bufptr, bufsize, flags)
}
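The public Getfsstat API is unchanged by this refactor; a hedged sketch of typical use follows (MNT_NOWAIT is assumed to be the appropriate flag for a non-blocking scan).

package getfsstatsketch

import "golang.org/x/sys/unix"

// mountedFilesystems is a hypothetical helper: a nil slice queries only the
// count of mounted filesystems, then a sized slice receives the entries.
func mountedFilesystems() ([]unix.Statfs_t, error) {
	n, err := unix.Getfsstat(nil, unix.MNT_NOWAIT)
	if err != nil {
		return nil, err
	}
	buf := make([]unix.Statfs_t, n)
	n, err = unix.Getfsstat(buf, unix.MNT_NOWAIT)
	if err != nil {
		return nil, err
	}
	return buf[:n], nil
}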
//sysnb getresuid(ruid *_C_int, euid *_C_int, suid *_C_int)
@@ -171,6 +166,20 @@ func Getresgid() (rgid, egid, sgid int) {
//sys sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) = SYS___SYSCTL
+//sys fcntl(fd int, cmd int, arg int) (n int, err error)
+//sys fcntlPtr(fd int, cmd int, arg unsafe.Pointer) (n int, err error) = SYS_FCNTL
+
+// FcntlInt performs a fcntl syscall on fd with the provided command and argument.
+func FcntlInt(fd uintptr, cmd, arg int) (int, error) {
+ return fcntl(int(fd), cmd, arg)
+}
+
+// FcntlFlock performs a fcntl syscall for the F_GETLK, F_SETLK or F_SETLKW command.
+func FcntlFlock(fd uintptr, cmd int, lk *Flock_t) error {
+ _, err := fcntlPtr(int(fd), cmd, unsafe.Pointer(lk))
+ return err
+}
+
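A hedged sketch (not part of the patch) of the new FcntlFlock wrapper; the Flock_t field names are assumed to match the generated type for OpenBSD.

package flocksketch

import "golang.org/x/sys/unix"

// tryWriteLock is a hypothetical helper: it attempts a non-blocking write lock
// over the whole file referred to by fd (Len 0 means "to end of file").
func tryWriteLock(fd uintptr) error {
	lk := unix.Flock_t{
		Type:   unix.F_WRLCK,
		Whence: 0, // SEEK_SET: Start is relative to the beginning of the file
		Start:  0,
		Len:    0,
	}
	return unix.FcntlFlock(fd, unix.F_SETLK, &lk)
}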
//sys ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error)
func Ppoll(fds []PollFd, timeout *Timespec, sigmask *Sigset_t) (n int, err error) {
@@ -326,78 +335,7 @@ func Uname(uname *Utsname) error {
//sys write(fd int, p []byte) (n int, err error)
//sys mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) (ret uintptr, err error)
//sys munmap(addr uintptr, length uintptr) (err error)
-//sys readlen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_READ
-//sys writelen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_WRITE
+//sys getfsstat(stat *Statfs_t, bufsize uintptr, flags int) (n int, err error)
//sys utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error)
-
-/*
- * Unimplemented
- */
-// __getcwd
-// __semctl
-// __syscall
-// __sysctl
-// adjfreq
-// break
-// clock_getres
-// clock_gettime
-// clock_settime
-// closefrom
-// execve
-// fhopen
-// fhstat
-// fhstatfs
-// fork
-// futimens
-// getfh
-// getgid
-// getitimer
-// getlogin
-// getthrid
-// ktrace
-// lfs_bmapv
-// lfs_markv
-// lfs_segclean
-// lfs_segwait
-// mincore
-// minherit
-// mount
-// mquery
-// msgctl
-// msgget
-// msgrcv
-// msgsnd
-// nfssvc
-// nnpfspioctl
-// preadv
-// profil
-// pwritev
-// quotactl
-// readv
-// reboot
-// renameat
-// rfork
-// sched_yield
-// semget
-// semop
-// setgroups
-// setitimer
-// setsockopt
-// shmat
-// shmctl
-// shmdt
-// shmget
-// sigaction
-// sigaltstack
-// sigpending
-// sigprocmask
-// sigreturn
-// sigsuspend
-// sysarch
-// syscall
-// threxit
-// thrsigdivert
-// thrsleep
-// thrwakeup
-// vfork
-// writev
+//sys pledge(promises *byte, execpromises *byte) (err error)
+//sys unveil(path *byte, flags *byte) (err error)
diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd_386.go b/vendor/golang.org/x/sys/unix/syscall_openbsd_386.go
index 6baabcdcb..9ddc89f4f 100644
--- a/vendor/golang.org/x/sys/unix/syscall_openbsd_386.go
+++ b/vendor/golang.org/x/sys/unix/syscall_openbsd_386.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build 386 && openbsd
-// +build 386,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd_amd64.go b/vendor/golang.org/x/sys/unix/syscall_openbsd_amd64.go
index bab25360e..70a3c96ee 100644
--- a/vendor/golang.org/x/sys/unix/syscall_openbsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_openbsd_amd64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build amd64 && openbsd
-// +build amd64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd_arm.go b/vendor/golang.org/x/sys/unix/syscall_openbsd_arm.go
index 8eed3c4d4..265caa87f 100644
--- a/vendor/golang.org/x/sys/unix/syscall_openbsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/syscall_openbsd_arm.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build arm && openbsd
-// +build arm,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd_arm64.go b/vendor/golang.org/x/sys/unix/syscall_openbsd_arm64.go
index 483dde99d..ac4fda171 100644
--- a/vendor/golang.org/x/sys/unix/syscall_openbsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_openbsd_arm64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build arm64 && openbsd
-// +build arm64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd_libc.go b/vendor/golang.org/x/sys/unix/syscall_openbsd_libc.go
index 04aa43f41..0a451e6dd 100644
--- a/vendor/golang.org/x/sys/unix/syscall_openbsd_libc.go
+++ b/vendor/golang.org/x/sys/unix/syscall_openbsd_libc.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build openbsd
-// +build openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd_ppc64.go b/vendor/golang.org/x/sys/unix/syscall_openbsd_ppc64.go
index c2796139c..30a308cbb 100644
--- a/vendor/golang.org/x/sys/unix/syscall_openbsd_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_openbsd_ppc64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build ppc64 && openbsd
-// +build ppc64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd_riscv64.go b/vendor/golang.org/x/sys/unix/syscall_openbsd_riscv64.go
index 23199a7ff..ea954330f 100644
--- a/vendor/golang.org/x/sys/unix/syscall_openbsd_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_openbsd_riscv64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build riscv64 && openbsd
-// +build riscv64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_solaris.go b/vendor/golang.org/x/sys/unix/syscall_solaris.go
index 72d23575f..21974af06 100644
--- a/vendor/golang.org/x/sys/unix/syscall_solaris.go
+++ b/vendor/golang.org/x/sys/unix/syscall_solaris.go
@@ -128,7 +128,8 @@ func (sa *SockaddrUnix) sockaddr() (unsafe.Pointer, _Socklen, error) {
if n > 0 {
sl += _Socklen(n) + 1
}
- if sa.raw.Path[0] == '@' {
+ if sa.raw.Path[0] == '@' || (sa.raw.Path[0] == 0 && sl > 3) {
+ // Check sl > 3 so we don't change unnamed socket behavior.
sa.raw.Path[0] = 0
// Don't count trailing NUL for abstract address.
sl--
@@ -157,7 +158,7 @@ func GetsockoptString(fd, level, opt int) (string, error) {
if err != nil {
return "", err
}
- return string(buf[:vallen-1]), nil
+ return ByteSliceToString(buf[:vallen]), nil
}
const ImplementsGetwd = true
@@ -698,24 +699,6 @@ func Sendfile(outfd int, infd int, offset *int64, count int) (written int, err e
//sys setsockopt(s int, level int, name int, val unsafe.Pointer, vallen uintptr) (err error) = libsocket.setsockopt
//sys recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Socklen) (n int, err error) = libsocket.recvfrom
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procread)), 3, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf), 0, 0, 0)
- n = int(r0)
- if e1 != 0 {
- err = e1
- }
- return
-}
-
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procwrite)), 3, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf), 0, 0, 0)
- n = int(r0)
- if e1 != 0 {
- err = e1
- }
- return
-}
-
// Event Ports
type fileObjCookie struct {
diff --git a/vendor/golang.org/x/sys/unix/syscall_solaris_amd64.go b/vendor/golang.org/x/sys/unix/syscall_solaris_amd64.go
index 0bd25ef81..e02d8ceae 100644
--- a/vendor/golang.org/x/sys/unix/syscall_solaris_amd64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_solaris_amd64.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build amd64 && solaris
-// +build amd64,solaris
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_unix.go b/vendor/golang.org/x/sys/unix/syscall_unix.go
index 8bb30e7ce..77081de8c 100644
--- a/vendor/golang.org/x/sys/unix/syscall_unix.go
+++ b/vendor/golang.org/x/sys/unix/syscall_unix.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
-// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
package unix
@@ -549,6 +548,9 @@ func SetNonblock(fd int, nonblocking bool) (err error) {
if err != nil {
return err
}
+ if (flag&O_NONBLOCK != 0) == nonblocking {
+ return nil
+ }
if nonblocking {
flag |= O_NONBLOCK
} else {
diff --git a/vendor/golang.org/x/sys/unix/syscall_unix_gc.go b/vendor/golang.org/x/sys/unix/syscall_unix_gc.go
index b6919ca58..05c95bccf 100644
--- a/vendor/golang.org/x/sys/unix/syscall_unix_gc.go
+++ b/vendor/golang.org/x/sys/unix/syscall_unix_gc.go
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
//go:build (darwin || dragonfly || freebsd || (linux && !ppc64 && !ppc64le) || netbsd || openbsd || solaris) && gc
-// +build darwin dragonfly freebsd linux,!ppc64,!ppc64le netbsd openbsd solaris
-// +build gc
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_unix_gc_ppc64x.go b/vendor/golang.org/x/sys/unix/syscall_unix_gc_ppc64x.go
index f6f707acf..23f39b7af 100644
--- a/vendor/golang.org/x/sys/unix/syscall_unix_gc_ppc64x.go
+++ b/vendor/golang.org/x/sys/unix/syscall_unix_gc_ppc64x.go
@@ -3,9 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux && (ppc64le || ppc64) && gc
-// +build linux
-// +build ppc64le ppc64
-// +build gc
package unix
diff --git a/vendor/golang.org/x/sys/unix/syscall_zos_s390x.go b/vendor/golang.org/x/sys/unix/syscall_zos_s390x.go
index 44e72edb4..b473038c6 100644
--- a/vendor/golang.org/x/sys/unix/syscall_zos_s390x.go
+++ b/vendor/golang.org/x/sys/unix/syscall_zos_s390x.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build zos && s390x
-// +build zos,s390x
package unix
@@ -192,7 +191,6 @@ func (cmsg *Cmsghdr) SetLen(length int) {
//sys fcntl(fd int, cmd int, arg int) (val int, err error)
//sys read(fd int, p []byte) (n int, err error)
-//sys readlen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_READ
//sys write(fd int, p []byte) (n int, err error)
//sys accept(s int, rsa *RawSockaddrAny, addrlen *_Socklen) (fd int, err error) = SYS___ACCEPT_A
@@ -1106,7 +1104,7 @@ func GetsockoptString(fd, level, opt int) (string, error) {
return "", err
}
- return string(buf[:vallen-1]), nil
+ return ByteSliceToString(buf[:vallen]), nil
}
func Recvmsg(fd int, p, oob []byte, flags int) (n, oobn int, recvflags int, from Sockaddr, err error) {
diff --git a/vendor/golang.org/x/sys/unix/sysvshm_linux.go b/vendor/golang.org/x/sys/unix/sysvshm_linux.go
index 2c3a4437f..4fcd38de2 100644
--- a/vendor/golang.org/x/sys/unix/sysvshm_linux.go
+++ b/vendor/golang.org/x/sys/unix/sysvshm_linux.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build linux
-// +build linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/sysvshm_unix.go b/vendor/golang.org/x/sys/unix/sysvshm_unix.go
index 5bb41d17b..79a84f18b 100644
--- a/vendor/golang.org/x/sys/unix/sysvshm_unix.go
+++ b/vendor/golang.org/x/sys/unix/sysvshm_unix.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build (darwin && !ios) || linux
-// +build darwin,!ios linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/sysvshm_unix_other.go b/vendor/golang.org/x/sys/unix/sysvshm_unix_other.go
index 71bddefdb..9eb0db664 100644
--- a/vendor/golang.org/x/sys/unix/sysvshm_unix_other.go
+++ b/vendor/golang.org/x/sys/unix/sysvshm_unix_other.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build darwin && !ios
-// +build darwin,!ios
package unix
diff --git a/vendor/golang.org/x/sys/unix/timestruct.go b/vendor/golang.org/x/sys/unix/timestruct.go
index 616b1b284..7997b1902 100644
--- a/vendor/golang.org/x/sys/unix/timestruct.go
+++ b/vendor/golang.org/x/sys/unix/timestruct.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos
-// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos
package unix
diff --git a/vendor/golang.org/x/sys/unix/unveil_openbsd.go b/vendor/golang.org/x/sys/unix/unveil_openbsd.go
index 168d5ae77..cb7e598ce 100644
--- a/vendor/golang.org/x/sys/unix/unveil_openbsd.go
+++ b/vendor/golang.org/x/sys/unix/unveil_openbsd.go
@@ -4,39 +4,48 @@
package unix
-import (
- "syscall"
- "unsafe"
-)
+import "fmt"
// Unveil implements the unveil syscall.
// For more information see unveil(2).
// Note that the special case of blocking further
// unveil calls is handled by UnveilBlock.
func Unveil(path string, flags string) error {
- pathPtr, err := syscall.BytePtrFromString(path)
+ if err := supportsUnveil(); err != nil {
+ return err
+ }
+ pathPtr, err := BytePtrFromString(path)
if err != nil {
return err
}
- flagsPtr, err := syscall.BytePtrFromString(flags)
+ flagsPtr, err := BytePtrFromString(flags)
if err != nil {
return err
}
- _, _, e := syscall.Syscall(SYS_UNVEIL, uintptr(unsafe.Pointer(pathPtr)), uintptr(unsafe.Pointer(flagsPtr)), 0)
- if e != 0 {
- return e
- }
- return nil
+ return unveil(pathPtr, flagsPtr)
}
// UnveilBlock blocks future unveil calls.
// For more information see unveil(2).
func UnveilBlock() error {
- // Both pointers must be nil.
- var pathUnsafe, flagsUnsafe unsafe.Pointer
- _, _, e := syscall.Syscall(SYS_UNVEIL, uintptr(pathUnsafe), uintptr(flagsUnsafe), 0)
- if e != 0 {
- return e
+ if err := supportsUnveil(); err != nil {
+ return err
}
+ return unveil(nil, nil)
+}
+
+// supportsUnveil checks for availability of the unveil(2) system call based
+// on the running OpenBSD version.
+func supportsUnveil() error {
+ maj, min, err := majmin()
+ if err != nil {
+ return err
+ }
+
+ // unveil is not available before 6.4
+ if maj < 6 || (maj == 6 && min <= 3) {
+ return fmt.Errorf("cannot call Unveil on OpenBSD %d.%d", maj, min)
+ }
+
return nil
}
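A hedged usage sketch of the reworked wrappers; /etc/myapp is a placeholder path and the "r" permission string grants read-only access per unveil(2).

package unveilsketch

import "golang.org/x/sys/unix"

// restrictToConfig is a hypothetical helper: it narrows the process's view of
// the filesystem to read-only access under /etc/myapp, then forbids further
// unveil calls. On OpenBSD before 6.4 the new supportsUnveil check returns an error.
func restrictToConfig() error {
	if err := unix.Unveil("/etc/myapp", "r"); err != nil {
		return err
	}
	return unix.UnveilBlock()
}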
diff --git a/vendor/golang.org/x/sys/unix/xattr_bsd.go b/vendor/golang.org/x/sys/unix/xattr_bsd.go
index f5f8e9f36..e16879396 100644
--- a/vendor/golang.org/x/sys/unix/xattr_bsd.go
+++ b/vendor/golang.org/x/sys/unix/xattr_bsd.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build freebsd || netbsd
-// +build freebsd netbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zerrors_aix_ppc.go b/vendor/golang.org/x/sys/unix/zerrors_aix_ppc.go
index ca9799b79..2fb219d78 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_aix_ppc.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_aix_ppc.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build ppc && aix
-// +build ppc,aix
// Created by cgo -godefs - DO NOT EDIT
// cgo -godefs -- -maix32 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_aix_ppc64.go b/vendor/golang.org/x/sys/unix/zerrors_aix_ppc64.go
index 200c8c26f..b0e6f5c85 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_aix_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_aix_ppc64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build ppc64 && aix
-// +build ppc64,aix
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -maix64 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go
index 143007627..e40fa8524 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && darwin
-// +build amd64,darwin
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -m64 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go
index ab044a742..bb02aa6c0 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm64 && darwin
-// +build arm64,darwin
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -m64 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_dragonfly_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_dragonfly_amd64.go
index 17bba0e44..c0e0f8694 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_dragonfly_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_dragonfly_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && dragonfly
-// +build amd64,dragonfly
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -m64 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_freebsd_386.go b/vendor/golang.org/x/sys/unix/zerrors_freebsd_386.go
index f8c2c5138..6c6923906 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_freebsd_386.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_freebsd_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build 386 && freebsd
-// +build 386,freebsd
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -m32 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_freebsd_amd64.go
index 96310c3be..dd9163f8e 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_freebsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_freebsd_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && freebsd
-// +build amd64,freebsd
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -m64 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm.go b/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm.go
index 777b69def..493a2a793 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm && freebsd
-// +build arm,freebsd
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm64.go
index c557ac2db..8b437b307 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm64 && freebsd
-// +build arm64,freebsd
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -m64 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_freebsd_riscv64.go b/vendor/golang.org/x/sys/unix/zerrors_freebsd_riscv64.go
index 341b4d962..67c02dd57 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_freebsd_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_freebsd_riscv64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build riscv64 && freebsd
-// +build riscv64,freebsd
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -m64 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux.go b/vendor/golang.org/x/sys/unix/zerrors_linux.go
index 3784f402e..36bf8399f 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux.go
@@ -1,7 +1,6 @@
// Code generated by mkmerge; DO NOT EDIT.
//go:build linux
-// +build linux
package unix
@@ -481,10 +480,13 @@ const (
BPF_FROM_BE = 0x8
BPF_FROM_LE = 0x0
BPF_FS_MAGIC = 0xcafe4a11
+ BPF_F_AFTER = 0x10
BPF_F_ALLOW_MULTI = 0x2
BPF_F_ALLOW_OVERRIDE = 0x1
BPF_F_ANY_ALIGNMENT = 0x2
- BPF_F_KPROBE_MULTI_RETURN = 0x1
+ BPF_F_BEFORE = 0x8
+ BPF_F_ID = 0x20
+ BPF_F_NETFILTER_IP_DEFRAG = 0x1
BPF_F_QUERY_EFFECTIVE = 0x1
BPF_F_REPLACE = 0x4
BPF_F_SLEEPABLE = 0x10
@@ -521,6 +523,7 @@ const (
BPF_MAJOR_VERSION = 0x1
BPF_MAXINSNS = 0x1000
BPF_MEM = 0x60
+ BPF_MEMSX = 0x80
BPF_MEMWORDS = 0x10
BPF_MINOR_VERSION = 0x1
BPF_MISC = 0x7
@@ -776,6 +779,8 @@ const (
DEVLINK_GENL_MCGRP_CONFIG_NAME = "config"
DEVLINK_GENL_NAME = "devlink"
DEVLINK_GENL_VERSION = 0x1
+ DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO = 0x4
+ DEVLINK_PORT_FN_CAP_IPSEC_PACKET = 0x8
DEVLINK_PORT_FN_CAP_MIGRATABLE = 0x2
DEVLINK_PORT_FN_CAP_ROCE = 0x1
DEVLINK_SB_THRESHOLD_TO_ALPHA_MAX = 0x14
@@ -1698,6 +1703,7 @@ const (
KEXEC_ON_CRASH = 0x1
KEXEC_PRESERVE_CONTEXT = 0x2
KEXEC_SEGMENT_MAX = 0x10
+ KEXEC_UPDATE_ELFCOREHDR = 0x4
KEYCTL_ASSUME_AUTHORITY = 0x10
KEYCTL_CAPABILITIES = 0x1f
KEYCTL_CAPS0_BIG_KEY = 0x10
@@ -1779,6 +1785,8 @@ const (
LANDLOCK_ACCESS_FS_REMOVE_FILE = 0x20
LANDLOCK_ACCESS_FS_TRUNCATE = 0x4000
LANDLOCK_ACCESS_FS_WRITE_FILE = 0x2
+ LANDLOCK_ACCESS_NET_BIND_TCP = 0x1
+ LANDLOCK_ACCESS_NET_CONNECT_TCP = 0x2
LANDLOCK_CREATE_RULESET_VERSION = 0x1
LINUX_REBOOT_CMD_CAD_OFF = 0x0
LINUX_REBOOT_CMD_CAD_ON = 0x89abcdef
@@ -1795,6 +1803,7 @@ const (
LOCK_SH = 0x1
LOCK_UN = 0x8
LOOP_CLR_FD = 0x4c01
+ LOOP_CONFIGURE = 0x4c0a
LOOP_CTL_ADD = 0x4c80
LOOP_CTL_GET_FREE = 0x4c82
LOOP_CTL_REMOVE = 0x4c81
@@ -2120,6 +2129,60 @@ const (
NFNL_SUBSYS_QUEUE = 0x3
NFNL_SUBSYS_ULOG = 0x4
NFS_SUPER_MAGIC = 0x6969
+ NFT_CHAIN_FLAGS = 0x7
+ NFT_CHAIN_MAXNAMELEN = 0x100
+ NFT_CT_MAX = 0x17
+ NFT_DATA_RESERVED_MASK = 0xffffff00
+ NFT_DATA_VALUE_MAXLEN = 0x40
+ NFT_EXTHDR_OP_MAX = 0x4
+ NFT_FIB_RESULT_MAX = 0x3
+ NFT_INNER_MASK = 0xf
+ NFT_LOGLEVEL_MAX = 0x8
+ NFT_NAME_MAXLEN = 0x100
+ NFT_NG_MAX = 0x1
+ NFT_OBJECT_CONNLIMIT = 0x5
+ NFT_OBJECT_COUNTER = 0x1
+ NFT_OBJECT_CT_EXPECT = 0x9
+ NFT_OBJECT_CT_HELPER = 0x3
+ NFT_OBJECT_CT_TIMEOUT = 0x7
+ NFT_OBJECT_LIMIT = 0x4
+ NFT_OBJECT_MAX = 0xa
+ NFT_OBJECT_QUOTA = 0x2
+ NFT_OBJECT_SECMARK = 0x8
+ NFT_OBJECT_SYNPROXY = 0xa
+ NFT_OBJECT_TUNNEL = 0x6
+ NFT_OBJECT_UNSPEC = 0x0
+ NFT_OBJ_MAXNAMELEN = 0x100
+ NFT_OSF_MAXGENRELEN = 0x10
+ NFT_QUEUE_FLAG_BYPASS = 0x1
+ NFT_QUEUE_FLAG_CPU_FANOUT = 0x2
+ NFT_QUEUE_FLAG_MASK = 0x3
+ NFT_REG32_COUNT = 0x10
+ NFT_REG32_SIZE = 0x4
+ NFT_REG_MAX = 0x4
+ NFT_REG_SIZE = 0x10
+ NFT_REJECT_ICMPX_MAX = 0x3
+ NFT_RT_MAX = 0x4
+ NFT_SECMARK_CTX_MAXLEN = 0x100
+ NFT_SET_MAXNAMELEN = 0x100
+ NFT_SOCKET_MAX = 0x3
+ NFT_TABLE_F_MASK = 0x3
+ NFT_TABLE_MAXNAMELEN = 0x100
+ NFT_TRACETYPE_MAX = 0x3
+ NFT_TUNNEL_F_MASK = 0x7
+ NFT_TUNNEL_MAX = 0x1
+ NFT_TUNNEL_MODE_MAX = 0x2
+ NFT_USERDATA_MAXLEN = 0x100
+ NFT_XFRM_KEY_MAX = 0x6
+ NF_NAT_RANGE_MAP_IPS = 0x1
+ NF_NAT_RANGE_MASK = 0x7f
+ NF_NAT_RANGE_NETMAP = 0x40
+ NF_NAT_RANGE_PERSISTENT = 0x8
+ NF_NAT_RANGE_PROTO_OFFSET = 0x20
+ NF_NAT_RANGE_PROTO_RANDOM = 0x4
+ NF_NAT_RANGE_PROTO_RANDOM_ALL = 0x14
+ NF_NAT_RANGE_PROTO_RANDOM_FULLY = 0x10
+ NF_NAT_RANGE_PROTO_SPECIFIED = 0x2
NILFS_SUPER_MAGIC = 0x3434
NL0 = 0x0
NL1 = 0x100
@@ -2275,6 +2338,7 @@ const (
PERF_MEM_LVLNUM_PMEM = 0xe
PERF_MEM_LVLNUM_RAM = 0xd
PERF_MEM_LVLNUM_SHIFT = 0x21
+ PERF_MEM_LVLNUM_UNC = 0x8
PERF_MEM_LVL_HIT = 0x2
PERF_MEM_LVL_IO = 0x1000
PERF_MEM_LVL_L1 = 0x8
@@ -2403,6 +2467,7 @@ const (
PR_MCE_KILL_GET = 0x22
PR_MCE_KILL_LATE = 0x0
PR_MCE_KILL_SET = 0x1
+ PR_MDWE_NO_INHERIT = 0x2
PR_MDWE_REFUSE_EXEC_GAIN = 0x1
PR_MPX_DISABLE_MANAGEMENT = 0x2c
PR_MPX_ENABLE_MANAGEMENT = 0x2b
@@ -2421,6 +2486,15 @@ const (
PR_PAC_GET_ENABLED_KEYS = 0x3d
PR_PAC_RESET_KEYS = 0x36
PR_PAC_SET_ENABLED_KEYS = 0x3c
+ PR_RISCV_V_GET_CONTROL = 0x46
+ PR_RISCV_V_SET_CONTROL = 0x45
+ PR_RISCV_V_VSTATE_CTRL_CUR_MASK = 0x3
+ PR_RISCV_V_VSTATE_CTRL_DEFAULT = 0x0
+ PR_RISCV_V_VSTATE_CTRL_INHERIT = 0x10
+ PR_RISCV_V_VSTATE_CTRL_MASK = 0x1f
+ PR_RISCV_V_VSTATE_CTRL_NEXT_MASK = 0xc
+ PR_RISCV_V_VSTATE_CTRL_OFF = 0x1
+ PR_RISCV_V_VSTATE_CTRL_ON = 0x2
PR_SCHED_CORE = 0x3e
PR_SCHED_CORE_CREATE = 0x1
PR_SCHED_CORE_GET = 0x0
@@ -2598,8 +2672,9 @@ const (
RTAX_FEATURES = 0xc
RTAX_FEATURE_ALLFRAG = 0x8
RTAX_FEATURE_ECN = 0x1
- RTAX_FEATURE_MASK = 0xf
+ RTAX_FEATURE_MASK = 0x1f
RTAX_FEATURE_SACK = 0x2
+ RTAX_FEATURE_TCP_USEC_TS = 0x10
RTAX_FEATURE_TIMESTAMP = 0x4
RTAX_HOPLIMIT = 0xa
RTAX_INITCWND = 0xb
@@ -2821,13 +2896,59 @@ const (
RWF_SUPPORTED = 0x1f
RWF_SYNC = 0x4
RWF_WRITE_LIFE_NOT_SET = 0x0
+ SCHED_BATCH = 0x3
+ SCHED_DEADLINE = 0x6
+ SCHED_FIFO = 0x1
+ SCHED_FLAG_ALL = 0x7f
+ SCHED_FLAG_DL_OVERRUN = 0x4
+ SCHED_FLAG_KEEP_ALL = 0x18
+ SCHED_FLAG_KEEP_PARAMS = 0x10
+ SCHED_FLAG_KEEP_POLICY = 0x8
+ SCHED_FLAG_RECLAIM = 0x2
+ SCHED_FLAG_RESET_ON_FORK = 0x1
+ SCHED_FLAG_UTIL_CLAMP = 0x60
+ SCHED_FLAG_UTIL_CLAMP_MAX = 0x40
+ SCHED_FLAG_UTIL_CLAMP_MIN = 0x20
+ SCHED_IDLE = 0x5
+ SCHED_NORMAL = 0x0
+ SCHED_RESET_ON_FORK = 0x40000000
+ SCHED_RR = 0x2
SCM_CREDENTIALS = 0x2
SCM_RIGHTS = 0x1
SCM_TIMESTAMP = 0x1d
SC_LOG_FLUSH = 0x100000
+ SECCOMP_ADDFD_FLAG_SEND = 0x2
+ SECCOMP_ADDFD_FLAG_SETFD = 0x1
+ SECCOMP_FILTER_FLAG_LOG = 0x2
+ SECCOMP_FILTER_FLAG_NEW_LISTENER = 0x8
+ SECCOMP_FILTER_FLAG_SPEC_ALLOW = 0x4
+ SECCOMP_FILTER_FLAG_TSYNC = 0x1
+ SECCOMP_FILTER_FLAG_TSYNC_ESRCH = 0x10
+ SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV = 0x20
+ SECCOMP_GET_ACTION_AVAIL = 0x2
+ SECCOMP_GET_NOTIF_SIZES = 0x3
+ SECCOMP_IOCTL_NOTIF_RECV = 0xc0502100
+ SECCOMP_IOCTL_NOTIF_SEND = 0xc0182101
+ SECCOMP_IOC_MAGIC = '!'
SECCOMP_MODE_DISABLED = 0x0
SECCOMP_MODE_FILTER = 0x2
SECCOMP_MODE_STRICT = 0x1
+ SECCOMP_RET_ACTION = 0x7fff0000
+ SECCOMP_RET_ACTION_FULL = 0xffff0000
+ SECCOMP_RET_ALLOW = 0x7fff0000
+ SECCOMP_RET_DATA = 0xffff
+ SECCOMP_RET_ERRNO = 0x50000
+ SECCOMP_RET_KILL = 0x0
+ SECCOMP_RET_KILL_PROCESS = 0x80000000
+ SECCOMP_RET_KILL_THREAD = 0x0
+ SECCOMP_RET_LOG = 0x7ffc0000
+ SECCOMP_RET_TRACE = 0x7ff00000
+ SECCOMP_RET_TRAP = 0x30000
+ SECCOMP_RET_USER_NOTIF = 0x7fc00000
+ SECCOMP_SET_MODE_FILTER = 0x1
+ SECCOMP_SET_MODE_STRICT = 0x0
+ SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP = 0x1
+ SECCOMP_USER_NOTIF_FLAG_CONTINUE = 0x1
SECRETMEM_MAGIC = 0x5345434d
SECURITYFS_MAGIC = 0x73636673
SEEK_CUR = 0x1
@@ -2987,6 +3108,7 @@ const (
SOL_TIPC = 0x10f
SOL_TLS = 0x11a
SOL_UDP = 0x11
+ SOL_VSOCK = 0x11f
SOL_X25 = 0x106
SOL_XDP = 0x11b
SOMAXCONN = 0x1000
@@ -3435,6 +3557,7 @@ const (
XDP_PACKET_HEADROOM = 0x100
XDP_PGOFF_RX_RING = 0x0
XDP_PGOFF_TX_RING = 0x80000000
+ XDP_PKT_CONTD = 0x1
XDP_RING_NEED_WAKEUP = 0x1
XDP_RX_RING = 0x2
XDP_SHARED_UMEM = 0x1
@@ -3447,6 +3570,7 @@ const (
XDP_UMEM_REG = 0x4
XDP_UMEM_UNALIGNED_CHUNK_FLAG = 0x1
XDP_USE_NEED_WAKEUP = 0x8
+ XDP_USE_SG = 0x10
XDP_ZEROCOPY = 0x4
XENFS_SUPER_MAGIC = 0xabba1974
XFS_SUPER_MAGIC = 0x58465342
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_386.go b/vendor/golang.org/x/sys/unix/zerrors_linux_386.go
index cfb143001..42ff8c3c1 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_386.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build 386 && linux
-// +build 386,linux
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -Wall -Werror -static -I/tmp/386/include -m32 _const.go
@@ -282,6 +281,9 @@ const (
SCM_TIMESTAMPNS = 0x23
SCM_TXTIME = 0x3d
SCM_WIFI_STATUS = 0x29
+ SECCOMP_IOCTL_NOTIF_ADDFD = 0x40182103
+ SECCOMP_IOCTL_NOTIF_ID_VALID = 0x40082102
+ SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x40082104
SFD_CLOEXEC = 0x80000
SFD_NONBLOCK = 0x800
SIOCATMARK = 0x8905
@@ -326,10 +328,12 @@ const (
SO_NOFCS = 0x2b
SO_OOBINLINE = 0xa
SO_PASSCRED = 0x10
+ SO_PASSPIDFD = 0x4c
SO_PASSSEC = 0x22
SO_PEEK_OFF = 0x2a
SO_PEERCRED = 0x11
SO_PEERGROUPS = 0x3b
+ SO_PEERPIDFD = 0x4d
SO_PEERSEC = 0x1f
SO_PREFER_BUSY_POLL = 0x45
SO_PROTOCOL = 0x26
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
index df64f2d59..dca436004 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && linux
-// +build amd64,linux
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -Wall -Werror -static -I/tmp/amd64/include -m64 _const.go
@@ -283,6 +282,9 @@ const (
SCM_TIMESTAMPNS = 0x23
SCM_TXTIME = 0x3d
SCM_WIFI_STATUS = 0x29
+ SECCOMP_IOCTL_NOTIF_ADDFD = 0x40182103
+ SECCOMP_IOCTL_NOTIF_ID_VALID = 0x40082102
+ SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x40082104
SFD_CLOEXEC = 0x80000
SFD_NONBLOCK = 0x800
SIOCATMARK = 0x8905
@@ -327,10 +329,12 @@ const (
SO_NOFCS = 0x2b
SO_OOBINLINE = 0xa
SO_PASSCRED = 0x10
+ SO_PASSPIDFD = 0x4c
SO_PASSSEC = 0x22
SO_PEEK_OFF = 0x2a
SO_PEERCRED = 0x11
SO_PEERGROUPS = 0x3b
+ SO_PEERPIDFD = 0x4d
SO_PEERSEC = 0x1f
SO_PREFER_BUSY_POLL = 0x45
SO_PROTOCOL = 0x26
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go
index 3025cd5b2..5cca668ac 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm && linux
-// +build arm,linux
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -Wall -Werror -static -I/tmp/arm/include _const.go
@@ -289,6 +288,9 @@ const (
SCM_TIMESTAMPNS = 0x23
SCM_TXTIME = 0x3d
SCM_WIFI_STATUS = 0x29
+ SECCOMP_IOCTL_NOTIF_ADDFD = 0x40182103
+ SECCOMP_IOCTL_NOTIF_ID_VALID = 0x40082102
+ SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x40082104
SFD_CLOEXEC = 0x80000
SFD_NONBLOCK = 0x800
SIOCATMARK = 0x8905
@@ -333,10 +335,12 @@ const (
SO_NOFCS = 0x2b
SO_OOBINLINE = 0xa
SO_PASSCRED = 0x10
+ SO_PASSPIDFD = 0x4c
SO_PASSSEC = 0x22
SO_PEEK_OFF = 0x2a
SO_PEERCRED = 0x11
SO_PEERGROUPS = 0x3b
+ SO_PEERPIDFD = 0x4d
SO_PEERSEC = 0x1f
SO_PREFER_BUSY_POLL = 0x45
SO_PROTOCOL = 0x26
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
index 09e1ffbef..d8cae6d15 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm64 && linux
-// +build arm64,linux
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -Wall -Werror -static -I/tmp/arm64/include -fsigned-char _const.go
@@ -279,6 +278,9 @@ const (
SCM_TIMESTAMPNS = 0x23
SCM_TXTIME = 0x3d
SCM_WIFI_STATUS = 0x29
+ SECCOMP_IOCTL_NOTIF_ADDFD = 0x40182103
+ SECCOMP_IOCTL_NOTIF_ID_VALID = 0x40082102
+ SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x40082104
SFD_CLOEXEC = 0x80000
SFD_NONBLOCK = 0x800
SIOCATMARK = 0x8905
@@ -323,10 +325,12 @@ const (
SO_NOFCS = 0x2b
SO_OOBINLINE = 0xa
SO_PASSCRED = 0x10
+ SO_PASSPIDFD = 0x4c
SO_PASSSEC = 0x22
SO_PEEK_OFF = 0x2a
SO_PEERCRED = 0x11
SO_PEERGROUPS = 0x3b
+ SO_PEERPIDFD = 0x4d
SO_PEERSEC = 0x1f
SO_PREFER_BUSY_POLL = 0x45
SO_PROTOCOL = 0x26
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go
index a45723540..28e39afdc 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build loong64 && linux
-// +build loong64,linux
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -Wall -Werror -static -I/tmp/loong64/include _const.go
@@ -118,6 +117,9 @@ const (
IUCLC = 0x200
IXOFF = 0x1000
IXON = 0x400
+ LASX_CTX_MAGIC = 0x41535801
+ LBT_CTX_MAGIC = 0x42540001
+ LSX_CTX_MAGIC = 0x53580001
MAP_ANON = 0x20
MAP_ANONYMOUS = 0x20
MAP_DENYWRITE = 0x800
@@ -273,6 +275,9 @@ const (
SCM_TIMESTAMPNS = 0x23
SCM_TXTIME = 0x3d
SCM_WIFI_STATUS = 0x29
+ SECCOMP_IOCTL_NOTIF_ADDFD = 0x40182103
+ SECCOMP_IOCTL_NOTIF_ID_VALID = 0x40082102
+ SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x40082104
SFD_CLOEXEC = 0x80000
SFD_NONBLOCK = 0x800
SIOCATMARK = 0x8905
@@ -317,10 +322,12 @@ const (
SO_NOFCS = 0x2b
SO_OOBINLINE = 0xa
SO_PASSCRED = 0x10
+ SO_PASSPIDFD = 0x4c
SO_PASSSEC = 0x22
SO_PEEK_OFF = 0x2a
SO_PEERCRED = 0x11
SO_PEERGROUPS = 0x3b
+ SO_PEERPIDFD = 0x4d
SO_PEERSEC = 0x1f
SO_PREFER_BUSY_POLL = 0x45
SO_PROTOCOL = 0x26
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go
index fee7dfb81..cd66e92cb 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build mips && linux
-// +build mips,linux
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -Wall -Werror -static -I/tmp/mips/include _const.go
@@ -282,6 +281,9 @@ const (
SCM_TIMESTAMPNS = 0x23
SCM_TXTIME = 0x3d
SCM_WIFI_STATUS = 0x29
+ SECCOMP_IOCTL_NOTIF_ADDFD = 0x80182103
+ SECCOMP_IOCTL_NOTIF_ID_VALID = 0x80082102
+ SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x80082104
SFD_CLOEXEC = 0x80000
SFD_NONBLOCK = 0x80
SIOCATMARK = 0x40047307
@@ -326,10 +328,12 @@ const (
SO_NOFCS = 0x2b
SO_OOBINLINE = 0x100
SO_PASSCRED = 0x11
+ SO_PASSPIDFD = 0x4c
SO_PASSSEC = 0x22
SO_PEEK_OFF = 0x2a
SO_PEERCRED = 0x12
SO_PEERGROUPS = 0x3b
+ SO_PEERPIDFD = 0x4d
SO_PEERSEC = 0x1e
SO_PREFER_BUSY_POLL = 0x45
SO_PROTOCOL = 0x1028
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go
index a5b2373ae..c1595eba7 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build mips64 && linux
-// +build mips64,linux
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -Wall -Werror -static -I/tmp/mips64/include _const.go
@@ -282,6 +281,9 @@ const (
SCM_TIMESTAMPNS = 0x23
SCM_TXTIME = 0x3d
SCM_WIFI_STATUS = 0x29
+ SECCOMP_IOCTL_NOTIF_ADDFD = 0x80182103
+ SECCOMP_IOCTL_NOTIF_ID_VALID = 0x80082102
+ SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x80082104
SFD_CLOEXEC = 0x80000
SFD_NONBLOCK = 0x80
SIOCATMARK = 0x40047307
@@ -326,10 +328,12 @@ const (
SO_NOFCS = 0x2b
SO_OOBINLINE = 0x100
SO_PASSCRED = 0x11
+ SO_PASSPIDFD = 0x4c
SO_PASSSEC = 0x22
SO_PEEK_OFF = 0x2a
SO_PEERCRED = 0x12
SO_PEERGROUPS = 0x3b
+ SO_PEERPIDFD = 0x4d
SO_PEERSEC = 0x1e
SO_PREFER_BUSY_POLL = 0x45
SO_PROTOCOL = 0x1028
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go
index 5dde82c98..ee9456b0d 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build mips64le && linux
-// +build mips64le,linux
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -Wall -Werror -static -I/tmp/mips64le/include _const.go
@@ -282,6 +281,9 @@ const (
SCM_TIMESTAMPNS = 0x23
SCM_TXTIME = 0x3d
SCM_WIFI_STATUS = 0x29
+ SECCOMP_IOCTL_NOTIF_ADDFD = 0x80182103
+ SECCOMP_IOCTL_NOTIF_ID_VALID = 0x80082102
+ SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x80082104
SFD_CLOEXEC = 0x80000
SFD_NONBLOCK = 0x80
SIOCATMARK = 0x40047307
@@ -326,10 +328,12 @@ const (
SO_NOFCS = 0x2b
SO_OOBINLINE = 0x100
SO_PASSCRED = 0x11
+ SO_PASSPIDFD = 0x4c
SO_PASSSEC = 0x22
SO_PEEK_OFF = 0x2a
SO_PEERCRED = 0x12
SO_PEERGROUPS = 0x3b
+ SO_PEERPIDFD = 0x4d
SO_PEERSEC = 0x1e
SO_PREFER_BUSY_POLL = 0x45
SO_PROTOCOL = 0x1028
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go
index 2e80ea6b3..8cfca81e1 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build mipsle && linux
-// +build mipsle,linux
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -Wall -Werror -static -I/tmp/mipsle/include _const.go
@@ -282,6 +281,9 @@ const (
SCM_TIMESTAMPNS = 0x23
SCM_TXTIME = 0x3d
SCM_WIFI_STATUS = 0x29
+ SECCOMP_IOCTL_NOTIF_ADDFD = 0x80182103
+ SECCOMP_IOCTL_NOTIF_ID_VALID = 0x80082102
+ SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x80082104
SFD_CLOEXEC = 0x80000
SFD_NONBLOCK = 0x80
SIOCATMARK = 0x40047307
@@ -326,10 +328,12 @@ const (
SO_NOFCS = 0x2b
SO_OOBINLINE = 0x100
SO_PASSCRED = 0x11
+ SO_PASSPIDFD = 0x4c
SO_PASSSEC = 0x22
SO_PEEK_OFF = 0x2a
SO_PEERCRED = 0x12
SO_PEERGROUPS = 0x3b
+ SO_PEERPIDFD = 0x4d
SO_PEERSEC = 0x1e
SO_PREFER_BUSY_POLL = 0x45
SO_PROTOCOL = 0x1028
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go
index a65dcd7cb..60b0deb3a 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build ppc && linux
-// +build ppc,linux
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -Wall -Werror -static -I/tmp/ppc/include _const.go
@@ -337,6 +336,9 @@ const (
SCM_TIMESTAMPNS = 0x23
SCM_TXTIME = 0x3d
SCM_WIFI_STATUS = 0x29
+ SECCOMP_IOCTL_NOTIF_ADDFD = 0x80182103
+ SECCOMP_IOCTL_NOTIF_ID_VALID = 0x80082102
+ SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x80082104
SFD_CLOEXEC = 0x80000
SFD_NONBLOCK = 0x800
SIOCATMARK = 0x8905
@@ -381,10 +383,12 @@ const (
SO_NOFCS = 0x2b
SO_OOBINLINE = 0xa
SO_PASSCRED = 0x14
+ SO_PASSPIDFD = 0x4c
SO_PASSSEC = 0x22
SO_PEEK_OFF = 0x2a
SO_PEERCRED = 0x15
SO_PEERGROUPS = 0x3b
+ SO_PEERPIDFD = 0x4d
SO_PEERSEC = 0x1f
SO_PREFER_BUSY_POLL = 0x45
SO_PROTOCOL = 0x26
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go
index cbd34e3d8..f90aa7281 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build ppc64 && linux
-// +build ppc64,linux
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -Wall -Werror -static -I/tmp/ppc64/include _const.go
@@ -341,6 +340,9 @@ const (
SCM_TIMESTAMPNS = 0x23
SCM_TXTIME = 0x3d
SCM_WIFI_STATUS = 0x29
+ SECCOMP_IOCTL_NOTIF_ADDFD = 0x80182103
+ SECCOMP_IOCTL_NOTIF_ID_VALID = 0x80082102
+ SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x80082104
SFD_CLOEXEC = 0x80000
SFD_NONBLOCK = 0x800
SIOCATMARK = 0x8905
@@ -385,10 +387,12 @@ const (
SO_NOFCS = 0x2b
SO_OOBINLINE = 0xa
SO_PASSCRED = 0x14
+ SO_PASSPIDFD = 0x4c
SO_PASSSEC = 0x22
SO_PEEK_OFF = 0x2a
SO_PEERCRED = 0x15
SO_PEERGROUPS = 0x3b
+ SO_PEERPIDFD = 0x4d
SO_PEERSEC = 0x1f
SO_PREFER_BUSY_POLL = 0x45
SO_PROTOCOL = 0x26
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go
index e4afa7a31..ba9e01503 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build ppc64le && linux
-// +build ppc64le,linux
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -Wall -Werror -static -I/tmp/ppc64le/include _const.go
@@ -341,6 +340,9 @@ const (
SCM_TIMESTAMPNS = 0x23
SCM_TXTIME = 0x3d
SCM_WIFI_STATUS = 0x29
+ SECCOMP_IOCTL_NOTIF_ADDFD = 0x80182103
+ SECCOMP_IOCTL_NOTIF_ID_VALID = 0x80082102
+ SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x80082104
SFD_CLOEXEC = 0x80000
SFD_NONBLOCK = 0x800
SIOCATMARK = 0x8905
@@ -385,10 +387,12 @@ const (
SO_NOFCS = 0x2b
SO_OOBINLINE = 0xa
SO_PASSCRED = 0x14
+ SO_PASSPIDFD = 0x4c
SO_PASSSEC = 0x22
SO_PEEK_OFF = 0x2a
SO_PEERCRED = 0x15
SO_PEERGROUPS = 0x3b
+ SO_PEERPIDFD = 0x4d
SO_PEERSEC = 0x1f
SO_PREFER_BUSY_POLL = 0x45
SO_PROTOCOL = 0x26
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go
index 44f45a039..07cdfd6e9 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build riscv64 && linux
-// +build riscv64,linux
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -Wall -Werror -static -I/tmp/riscv64/include _const.go
@@ -228,6 +227,9 @@ const (
PPPIOCUNBRIDGECHAN = 0x7434
PPPIOCXFERUNIT = 0x744e
PR_SET_PTRACER_ANY = 0xffffffffffffffff
+ PTRACE_GETFDPIC = 0x21
+ PTRACE_GETFDPIC_EXEC = 0x0
+ PTRACE_GETFDPIC_INTERP = 0x1
RLIMIT_AS = 0x9
RLIMIT_MEMLOCK = 0x8
RLIMIT_NOFILE = 0x7
@@ -270,6 +272,9 @@ const (
SCM_TIMESTAMPNS = 0x23
SCM_TXTIME = 0x3d
SCM_WIFI_STATUS = 0x29
+ SECCOMP_IOCTL_NOTIF_ADDFD = 0x40182103
+ SECCOMP_IOCTL_NOTIF_ID_VALID = 0x40082102
+ SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x40082104
SFD_CLOEXEC = 0x80000
SFD_NONBLOCK = 0x800
SIOCATMARK = 0x8905
@@ -314,10 +319,12 @@ const (
SO_NOFCS = 0x2b
SO_OOBINLINE = 0xa
SO_PASSCRED = 0x10
+ SO_PASSPIDFD = 0x4c
SO_PASSSEC = 0x22
SO_PEEK_OFF = 0x2a
SO_PEERCRED = 0x11
SO_PEERGROUPS = 0x3b
+ SO_PEERPIDFD = 0x4d
SO_PEERSEC = 0x1f
SO_PREFER_BUSY_POLL = 0x45
SO_PROTOCOL = 0x26
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go b/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go
index 74733e260..2f1dd214a 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build s390x && linux
-// +build s390x,linux
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -Wall -Werror -static -I/tmp/s390x/include -fsigned-char _const.go
@@ -345,6 +344,9 @@ const (
SCM_TIMESTAMPNS = 0x23
SCM_TXTIME = 0x3d
SCM_WIFI_STATUS = 0x29
+ SECCOMP_IOCTL_NOTIF_ADDFD = 0x40182103
+ SECCOMP_IOCTL_NOTIF_ID_VALID = 0x40082102
+ SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x40082104
SFD_CLOEXEC = 0x80000
SFD_NONBLOCK = 0x800
SIOCATMARK = 0x8905
@@ -389,10 +391,12 @@ const (
SO_NOFCS = 0x2b
SO_OOBINLINE = 0xa
SO_PASSCRED = 0x10
+ SO_PASSPIDFD = 0x4c
SO_PASSSEC = 0x22
SO_PEEK_OFF = 0x2a
SO_PEERCRED = 0x11
SO_PEERGROUPS = 0x3b
+ SO_PEERPIDFD = 0x4d
SO_PEERSEC = 0x1f
SO_PREFER_BUSY_POLL = 0x45
SO_PROTOCOL = 0x26
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go
index f5f3934b1..f40519d90 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build sparc64 && linux
-// +build sparc64,linux
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -Wall -Werror -static -I/tmp/sparc64/include _const.go
@@ -336,6 +335,9 @@ const (
SCM_TIMESTAMPNS = 0x21
SCM_TXTIME = 0x3f
SCM_WIFI_STATUS = 0x25
+ SECCOMP_IOCTL_NOTIF_ADDFD = 0x80182103
+ SECCOMP_IOCTL_NOTIF_ID_VALID = 0x80082102
+ SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x80082104
SFD_CLOEXEC = 0x400000
SFD_NONBLOCK = 0x4000
SF_FP = 0x38
@@ -428,10 +430,12 @@ const (
SO_NOFCS = 0x27
SO_OOBINLINE = 0x100
SO_PASSCRED = 0x2
+ SO_PASSPIDFD = 0x55
SO_PASSSEC = 0x1f
SO_PEEK_OFF = 0x26
SO_PEERCRED = 0x40
SO_PEERGROUPS = 0x3d
+ SO_PEERPIDFD = 0x56
SO_PEERSEC = 0x1e
SO_PREFER_BUSY_POLL = 0x48
SO_PROTOCOL = 0x1028
diff --git a/vendor/golang.org/x/sys/unix/zerrors_netbsd_386.go b/vendor/golang.org/x/sys/unix/zerrors_netbsd_386.go
index 72f7420d2..130085df4 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_netbsd_386.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_netbsd_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build 386 && netbsd
-// +build 386,netbsd
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -m32 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_netbsd_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_netbsd_amd64.go
index 8d4eb0c08..84769a1a3 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_netbsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_netbsd_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && netbsd
-// +build amd64,netbsd
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -m64 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_netbsd_arm.go b/vendor/golang.org/x/sys/unix/zerrors_netbsd_arm.go
index 9eef9749f..602ded003 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_netbsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_netbsd_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm && netbsd
-// +build arm,netbsd
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -marm _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_netbsd_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_netbsd_arm64.go
index 3b62ba192..efc0406ee 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_netbsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_netbsd_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm64 && netbsd
-// +build arm64,netbsd
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -m64 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_openbsd_386.go b/vendor/golang.org/x/sys/unix/zerrors_openbsd_386.go
index af20e474b..5a6500f83 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_openbsd_386.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_openbsd_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build 386 && openbsd
-// +build 386,openbsd
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -m32 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_openbsd_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_openbsd_amd64.go
index 6015fcb2b..a5aeeb979 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_openbsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_openbsd_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && openbsd
-// +build amd64,openbsd
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -m64 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_openbsd_arm.go b/vendor/golang.org/x/sys/unix/zerrors_openbsd_arm.go
index 8d44955e4..0e9748a72 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_openbsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_openbsd_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm && openbsd
-// +build arm,openbsd
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_openbsd_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_openbsd_arm64.go
index ae16fe754..4f4449abc 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_openbsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_openbsd_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm64 && openbsd
-// +build arm64,openbsd
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -m64 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_openbsd_mips64.go b/vendor/golang.org/x/sys/unix/zerrors_openbsd_mips64.go
index 03d90fe35..76a363f0f 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_openbsd_mips64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_openbsd_mips64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build mips64 && openbsd
-// +build mips64,openbsd
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -m64 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_openbsd_ppc64.go b/vendor/golang.org/x/sys/unix/zerrors_openbsd_ppc64.go
index 8e2c51b1e..43ca0cdfd 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_openbsd_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_openbsd_ppc64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build ppc64 && openbsd
-// +build ppc64,openbsd
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -m64 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_openbsd_riscv64.go b/vendor/golang.org/x/sys/unix/zerrors_openbsd_riscv64.go
index 13d403031..b1b8bb200 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_openbsd_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_openbsd_riscv64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build riscv64 && openbsd
-// +build riscv64,openbsd
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -m64 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_solaris_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_solaris_amd64.go
index 1afee6a08..d2ddd3176 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_solaris_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_solaris_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && solaris
-// +build amd64,solaris
// Code generated by cmd/cgo -godefs; DO NOT EDIT.
// cgo -godefs -- -m64 _const.go
diff --git a/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go b/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go
index fc7d0506f..4dfd2e051 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build zos && s390x
-// +build zos,s390x
// Hand edited based on zerrors_linux_s390x.go
// TODO: auto-generate.
diff --git a/vendor/golang.org/x/sys/unix/zptrace_armnn_linux.go b/vendor/golang.org/x/sys/unix/zptrace_armnn_linux.go
index 97f20ca28..586317c78 100644
--- a/vendor/golang.org/x/sys/unix/zptrace_armnn_linux.go
+++ b/vendor/golang.org/x/sys/unix/zptrace_armnn_linux.go
@@ -1,8 +1,6 @@
// Code generated by linux/mkall.go generatePtracePair("arm", "arm64"). DO NOT EDIT.
//go:build linux && (arm || arm64)
-// +build linux
-// +build arm arm64
package unix
diff --git a/vendor/golang.org/x/sys/unix/zptrace_mipsnn_linux.go b/vendor/golang.org/x/sys/unix/zptrace_mipsnn_linux.go
index 0b5f79430..d7c881be7 100644
--- a/vendor/golang.org/x/sys/unix/zptrace_mipsnn_linux.go
+++ b/vendor/golang.org/x/sys/unix/zptrace_mipsnn_linux.go
@@ -1,8 +1,6 @@
// Code generated by linux/mkall.go generatePtracePair("mips", "mips64"). DO NOT EDIT.
//go:build linux && (mips || mips64)
-// +build linux
-// +build mips mips64
package unix
diff --git a/vendor/golang.org/x/sys/unix/zptrace_mipsnnle_linux.go b/vendor/golang.org/x/sys/unix/zptrace_mipsnnle_linux.go
index 2807f7e64..2d2de5d29 100644
--- a/vendor/golang.org/x/sys/unix/zptrace_mipsnnle_linux.go
+++ b/vendor/golang.org/x/sys/unix/zptrace_mipsnnle_linux.go
@@ -1,8 +1,6 @@
// Code generated by linux/mkall.go generatePtracePair("mipsle", "mips64le"). DO NOT EDIT.
//go:build linux && (mipsle || mips64le)
-// +build linux
-// +build mipsle mips64le
package unix
diff --git a/vendor/golang.org/x/sys/unix/zptrace_x86_linux.go b/vendor/golang.org/x/sys/unix/zptrace_x86_linux.go
index 281ea64e3..5adc79fb5 100644
--- a/vendor/golang.org/x/sys/unix/zptrace_x86_linux.go
+++ b/vendor/golang.org/x/sys/unix/zptrace_x86_linux.go
@@ -1,8 +1,6 @@
// Code generated by linux/mkall.go generatePtracePair("386", "amd64"). DO NOT EDIT.
//go:build linux && (386 || amd64)
-// +build linux
-// +build 386 amd64
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc.go b/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc.go
index 9a257219d..6ea64a3c0 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build aix && ppc
-// +build aix,ppc
package unix
@@ -817,28 +816,6 @@ func write(fd int, p []byte) (n int, err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, p *byte, np int) (n int, err error) {
- r0, er := C.read(C.int(fd), C.uintptr_t(uintptr(unsafe.Pointer(p))), C.size_t(np))
- n = int(r0)
- if r0 == -1 && er != nil {
- err = er
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
-func writelen(fd int, p *byte, np int) (n int, err error) {
- r0, er := C.write(C.int(fd), C.uintptr_t(uintptr(unsafe.Pointer(p))), C.size_t(np))
- n = int(r0)
- if r0 == -1 && er != nil {
- err = er
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
func Dup2(oldfd int, newfd int) (err error) {
r0, er := C.dup2(C.int(oldfd), C.int(newfd))
if r0 == -1 && er != nil {
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64.go b/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64.go
index 6de80c20c..99ee4399a 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build aix && ppc64
-// +build aix,ppc64
package unix
@@ -762,28 +761,6 @@ func write(fd int, p []byte) (n int, err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, p *byte, np int) (n int, err error) {
- r0, e1 := callread(fd, uintptr(unsafe.Pointer(p)), np)
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
-func writelen(fd int, p *byte, np int) (n int, err error) {
- r0, e1 := callwrite(fd, uintptr(unsafe.Pointer(p)), np)
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
func Dup2(oldfd int, newfd int) (err error) {
_, e1 := calldup2(oldfd, newfd)
if e1 != 0 {
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64_gc.go b/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64_gc.go
index c4d50ae50..b68a78362 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64_gc.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64_gc.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build aix && ppc64 && gc
-// +build aix,ppc64,gc
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64_gccgo.go b/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64_gccgo.go
index 6903d3b09..0a87450bf 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64_gccgo.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64_gccgo.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build aix && ppc64 && gccgo
-// +build aix,ppc64,gccgo
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go
index 4037ccf7a..ccb02f240 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build darwin && amd64
-// +build darwin,amd64
package unix
@@ -725,6 +724,12 @@ func ioctl(fd int, req uint, arg uintptr) (err error) {
return
}
+var libc_ioctl_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_ioctl ioctl "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
_, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg))
if e1 != 0 {
@@ -733,10 +738,6 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
return
}
-var libc_ioctl_trampoline_addr uintptr
-
-//go:cgo_import_dynamic libc_ioctl ioctl "/usr/lib/libSystem.B.dylib"
-
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) {
@@ -2410,28 +2411,6 @@ var libc_munmap_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
func Fstat(fd int, stat *Stat_t) (err error) {
_, _, e1 := syscall_syscall(libc_fstat64_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0)
if e1 != 0 {
@@ -2521,14 +2500,6 @@ func ptrace1(request int, pid int, addr uintptr, data uintptr) (err error) {
return
}
-func ptrace1Ptr(request int, pid int, addr uintptr, data unsafe.Pointer) (err error) {
- _, _, e1 := syscall_syscall6(libc_ptrace_trampoline_addr, uintptr(request), uintptr(pid), addr, uintptr(data), 0, 0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
var libc_ptrace_trampoline_addr uintptr
//go:cgo_import_dynamic libc_ptrace ptrace "/usr/lib/libSystem.B.dylib"
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s
index 4baaed0bc..8b8bb2840 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s
@@ -5,703 +5,586 @@
TEXT libc_fdopendir_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fdopendir(SB)
-
GLOBL ·libc_fdopendir_trampoline_addr(SB), RODATA, $8
DATA ·libc_fdopendir_trampoline_addr(SB)/8, $libc_fdopendir_trampoline<>(SB)
TEXT libc_getgroups_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getgroups(SB)
-
GLOBL ·libc_getgroups_trampoline_addr(SB), RODATA, $8
DATA ·libc_getgroups_trampoline_addr(SB)/8, $libc_getgroups_trampoline<>(SB)
TEXT libc_setgroups_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setgroups(SB)
-
GLOBL ·libc_setgroups_trampoline_addr(SB), RODATA, $8
DATA ·libc_setgroups_trampoline_addr(SB)/8, $libc_setgroups_trampoline<>(SB)
TEXT libc_wait4_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_wait4(SB)
-
GLOBL ·libc_wait4_trampoline_addr(SB), RODATA, $8
DATA ·libc_wait4_trampoline_addr(SB)/8, $libc_wait4_trampoline<>(SB)
TEXT libc_accept_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_accept(SB)
-
GLOBL ·libc_accept_trampoline_addr(SB), RODATA, $8
DATA ·libc_accept_trampoline_addr(SB)/8, $libc_accept_trampoline<>(SB)
TEXT libc_bind_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_bind(SB)
-
GLOBL ·libc_bind_trampoline_addr(SB), RODATA, $8
DATA ·libc_bind_trampoline_addr(SB)/8, $libc_bind_trampoline<>(SB)
TEXT libc_connect_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_connect(SB)
-
GLOBL ·libc_connect_trampoline_addr(SB), RODATA, $8
DATA ·libc_connect_trampoline_addr(SB)/8, $libc_connect_trampoline<>(SB)
TEXT libc_socket_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_socket(SB)
-
GLOBL ·libc_socket_trampoline_addr(SB), RODATA, $8
DATA ·libc_socket_trampoline_addr(SB)/8, $libc_socket_trampoline<>(SB)
TEXT libc_getsockopt_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getsockopt(SB)
-
GLOBL ·libc_getsockopt_trampoline_addr(SB), RODATA, $8
DATA ·libc_getsockopt_trampoline_addr(SB)/8, $libc_getsockopt_trampoline<>(SB)
TEXT libc_setsockopt_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setsockopt(SB)
-
GLOBL ·libc_setsockopt_trampoline_addr(SB), RODATA, $8
DATA ·libc_setsockopt_trampoline_addr(SB)/8, $libc_setsockopt_trampoline<>(SB)
TEXT libc_getpeername_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getpeername(SB)
-
GLOBL ·libc_getpeername_trampoline_addr(SB), RODATA, $8
DATA ·libc_getpeername_trampoline_addr(SB)/8, $libc_getpeername_trampoline<>(SB)
TEXT libc_getsockname_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getsockname(SB)
-
GLOBL ·libc_getsockname_trampoline_addr(SB), RODATA, $8
DATA ·libc_getsockname_trampoline_addr(SB)/8, $libc_getsockname_trampoline<>(SB)
TEXT libc_shutdown_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_shutdown(SB)
-
GLOBL ·libc_shutdown_trampoline_addr(SB), RODATA, $8
DATA ·libc_shutdown_trampoline_addr(SB)/8, $libc_shutdown_trampoline<>(SB)
TEXT libc_socketpair_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_socketpair(SB)
-
GLOBL ·libc_socketpair_trampoline_addr(SB), RODATA, $8
DATA ·libc_socketpair_trampoline_addr(SB)/8, $libc_socketpair_trampoline<>(SB)
TEXT libc_recvfrom_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_recvfrom(SB)
-
GLOBL ·libc_recvfrom_trampoline_addr(SB), RODATA, $8
DATA ·libc_recvfrom_trampoline_addr(SB)/8, $libc_recvfrom_trampoline<>(SB)
TEXT libc_sendto_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_sendto(SB)
-
GLOBL ·libc_sendto_trampoline_addr(SB), RODATA, $8
DATA ·libc_sendto_trampoline_addr(SB)/8, $libc_sendto_trampoline<>(SB)
TEXT libc_recvmsg_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_recvmsg(SB)
-
GLOBL ·libc_recvmsg_trampoline_addr(SB), RODATA, $8
DATA ·libc_recvmsg_trampoline_addr(SB)/8, $libc_recvmsg_trampoline<>(SB)
TEXT libc_sendmsg_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_sendmsg(SB)
-
GLOBL ·libc_sendmsg_trampoline_addr(SB), RODATA, $8
DATA ·libc_sendmsg_trampoline_addr(SB)/8, $libc_sendmsg_trampoline<>(SB)
TEXT libc_kevent_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_kevent(SB)
-
GLOBL ·libc_kevent_trampoline_addr(SB), RODATA, $8
DATA ·libc_kevent_trampoline_addr(SB)/8, $libc_kevent_trampoline<>(SB)
TEXT libc_utimes_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_utimes(SB)
-
GLOBL ·libc_utimes_trampoline_addr(SB), RODATA, $8
DATA ·libc_utimes_trampoline_addr(SB)/8, $libc_utimes_trampoline<>(SB)
TEXT libc_futimes_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_futimes(SB)
-
GLOBL ·libc_futimes_trampoline_addr(SB), RODATA, $8
DATA ·libc_futimes_trampoline_addr(SB)/8, $libc_futimes_trampoline<>(SB)
TEXT libc_poll_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_poll(SB)
-
GLOBL ·libc_poll_trampoline_addr(SB), RODATA, $8
DATA ·libc_poll_trampoline_addr(SB)/8, $libc_poll_trampoline<>(SB)
TEXT libc_madvise_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_madvise(SB)
-
GLOBL ·libc_madvise_trampoline_addr(SB), RODATA, $8
DATA ·libc_madvise_trampoline_addr(SB)/8, $libc_madvise_trampoline<>(SB)
TEXT libc_mlock_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mlock(SB)
-
GLOBL ·libc_mlock_trampoline_addr(SB), RODATA, $8
DATA ·libc_mlock_trampoline_addr(SB)/8, $libc_mlock_trampoline<>(SB)
TEXT libc_mlockall_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mlockall(SB)
-
GLOBL ·libc_mlockall_trampoline_addr(SB), RODATA, $8
DATA ·libc_mlockall_trampoline_addr(SB)/8, $libc_mlockall_trampoline<>(SB)
TEXT libc_mprotect_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mprotect(SB)
-
GLOBL ·libc_mprotect_trampoline_addr(SB), RODATA, $8
DATA ·libc_mprotect_trampoline_addr(SB)/8, $libc_mprotect_trampoline<>(SB)
TEXT libc_msync_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_msync(SB)
-
GLOBL ·libc_msync_trampoline_addr(SB), RODATA, $8
DATA ·libc_msync_trampoline_addr(SB)/8, $libc_msync_trampoline<>(SB)
TEXT libc_munlock_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_munlock(SB)
-
GLOBL ·libc_munlock_trampoline_addr(SB), RODATA, $8
DATA ·libc_munlock_trampoline_addr(SB)/8, $libc_munlock_trampoline<>(SB)
TEXT libc_munlockall_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_munlockall(SB)
-
GLOBL ·libc_munlockall_trampoline_addr(SB), RODATA, $8
DATA ·libc_munlockall_trampoline_addr(SB)/8, $libc_munlockall_trampoline<>(SB)
TEXT libc_closedir_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_closedir(SB)
-
GLOBL ·libc_closedir_trampoline_addr(SB), RODATA, $8
DATA ·libc_closedir_trampoline_addr(SB)/8, $libc_closedir_trampoline<>(SB)
TEXT libc_readdir_r_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_readdir_r(SB)
-
GLOBL ·libc_readdir_r_trampoline_addr(SB), RODATA, $8
DATA ·libc_readdir_r_trampoline_addr(SB)/8, $libc_readdir_r_trampoline<>(SB)
TEXT libc_pipe_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_pipe(SB)
-
GLOBL ·libc_pipe_trampoline_addr(SB), RODATA, $8
DATA ·libc_pipe_trampoline_addr(SB)/8, $libc_pipe_trampoline<>(SB)
TEXT libc_getxattr_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getxattr(SB)
-
GLOBL ·libc_getxattr_trampoline_addr(SB), RODATA, $8
DATA ·libc_getxattr_trampoline_addr(SB)/8, $libc_getxattr_trampoline<>(SB)
TEXT libc_fgetxattr_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fgetxattr(SB)
-
GLOBL ·libc_fgetxattr_trampoline_addr(SB), RODATA, $8
DATA ·libc_fgetxattr_trampoline_addr(SB)/8, $libc_fgetxattr_trampoline<>(SB)
TEXT libc_setxattr_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setxattr(SB)
-
GLOBL ·libc_setxattr_trampoline_addr(SB), RODATA, $8
DATA ·libc_setxattr_trampoline_addr(SB)/8, $libc_setxattr_trampoline<>(SB)
TEXT libc_fsetxattr_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fsetxattr(SB)
-
GLOBL ·libc_fsetxattr_trampoline_addr(SB), RODATA, $8
DATA ·libc_fsetxattr_trampoline_addr(SB)/8, $libc_fsetxattr_trampoline<>(SB)
TEXT libc_removexattr_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_removexattr(SB)
-
GLOBL ·libc_removexattr_trampoline_addr(SB), RODATA, $8
DATA ·libc_removexattr_trampoline_addr(SB)/8, $libc_removexattr_trampoline<>(SB)
TEXT libc_fremovexattr_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fremovexattr(SB)
-
GLOBL ·libc_fremovexattr_trampoline_addr(SB), RODATA, $8
DATA ·libc_fremovexattr_trampoline_addr(SB)/8, $libc_fremovexattr_trampoline<>(SB)
TEXT libc_listxattr_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_listxattr(SB)
-
GLOBL ·libc_listxattr_trampoline_addr(SB), RODATA, $8
DATA ·libc_listxattr_trampoline_addr(SB)/8, $libc_listxattr_trampoline<>(SB)
TEXT libc_flistxattr_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_flistxattr(SB)
-
GLOBL ·libc_flistxattr_trampoline_addr(SB), RODATA, $8
DATA ·libc_flistxattr_trampoline_addr(SB)/8, $libc_flistxattr_trampoline<>(SB)
TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_utimensat(SB)
-
GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $8
DATA ·libc_utimensat_trampoline_addr(SB)/8, $libc_utimensat_trampoline<>(SB)
TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fcntl(SB)
-
GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $8
DATA ·libc_fcntl_trampoline_addr(SB)/8, $libc_fcntl_trampoline<>(SB)
TEXT libc_kill_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_kill(SB)
-
GLOBL ·libc_kill_trampoline_addr(SB), RODATA, $8
DATA ·libc_kill_trampoline_addr(SB)/8, $libc_kill_trampoline<>(SB)
TEXT libc_ioctl_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_ioctl(SB)
-
GLOBL ·libc_ioctl_trampoline_addr(SB), RODATA, $8
DATA ·libc_ioctl_trampoline_addr(SB)/8, $libc_ioctl_trampoline<>(SB)
TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_sysctl(SB)
-
GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8
DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB)
TEXT libc_sendfile_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_sendfile(SB)
-
GLOBL ·libc_sendfile_trampoline_addr(SB), RODATA, $8
DATA ·libc_sendfile_trampoline_addr(SB)/8, $libc_sendfile_trampoline<>(SB)
TEXT libc_shmat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_shmat(SB)
-
GLOBL ·libc_shmat_trampoline_addr(SB), RODATA, $8
DATA ·libc_shmat_trampoline_addr(SB)/8, $libc_shmat_trampoline<>(SB)
TEXT libc_shmctl_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_shmctl(SB)
-
GLOBL ·libc_shmctl_trampoline_addr(SB), RODATA, $8
DATA ·libc_shmctl_trampoline_addr(SB)/8, $libc_shmctl_trampoline<>(SB)
TEXT libc_shmdt_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_shmdt(SB)
-
GLOBL ·libc_shmdt_trampoline_addr(SB), RODATA, $8
DATA ·libc_shmdt_trampoline_addr(SB)/8, $libc_shmdt_trampoline<>(SB)
TEXT libc_shmget_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_shmget(SB)
-
GLOBL ·libc_shmget_trampoline_addr(SB), RODATA, $8
DATA ·libc_shmget_trampoline_addr(SB)/8, $libc_shmget_trampoline<>(SB)
TEXT libc_access_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_access(SB)
-
GLOBL ·libc_access_trampoline_addr(SB), RODATA, $8
DATA ·libc_access_trampoline_addr(SB)/8, $libc_access_trampoline<>(SB)
TEXT libc_adjtime_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_adjtime(SB)
-
GLOBL ·libc_adjtime_trampoline_addr(SB), RODATA, $8
DATA ·libc_adjtime_trampoline_addr(SB)/8, $libc_adjtime_trampoline<>(SB)
TEXT libc_chdir_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_chdir(SB)
-
GLOBL ·libc_chdir_trampoline_addr(SB), RODATA, $8
DATA ·libc_chdir_trampoline_addr(SB)/8, $libc_chdir_trampoline<>(SB)
TEXT libc_chflags_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_chflags(SB)
-
GLOBL ·libc_chflags_trampoline_addr(SB), RODATA, $8
DATA ·libc_chflags_trampoline_addr(SB)/8, $libc_chflags_trampoline<>(SB)
TEXT libc_chmod_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_chmod(SB)
-
GLOBL ·libc_chmod_trampoline_addr(SB), RODATA, $8
DATA ·libc_chmod_trampoline_addr(SB)/8, $libc_chmod_trampoline<>(SB)
TEXT libc_chown_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_chown(SB)
-
GLOBL ·libc_chown_trampoline_addr(SB), RODATA, $8
DATA ·libc_chown_trampoline_addr(SB)/8, $libc_chown_trampoline<>(SB)
TEXT libc_chroot_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_chroot(SB)
-
GLOBL ·libc_chroot_trampoline_addr(SB), RODATA, $8
DATA ·libc_chroot_trampoline_addr(SB)/8, $libc_chroot_trampoline<>(SB)
TEXT libc_clock_gettime_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_clock_gettime(SB)
-
GLOBL ·libc_clock_gettime_trampoline_addr(SB), RODATA, $8
DATA ·libc_clock_gettime_trampoline_addr(SB)/8, $libc_clock_gettime_trampoline<>(SB)
TEXT libc_close_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_close(SB)
-
GLOBL ·libc_close_trampoline_addr(SB), RODATA, $8
DATA ·libc_close_trampoline_addr(SB)/8, $libc_close_trampoline<>(SB)
TEXT libc_clonefile_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_clonefile(SB)
-
GLOBL ·libc_clonefile_trampoline_addr(SB), RODATA, $8
DATA ·libc_clonefile_trampoline_addr(SB)/8, $libc_clonefile_trampoline<>(SB)
TEXT libc_clonefileat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_clonefileat(SB)
-
GLOBL ·libc_clonefileat_trampoline_addr(SB), RODATA, $8
DATA ·libc_clonefileat_trampoline_addr(SB)/8, $libc_clonefileat_trampoline<>(SB)
TEXT libc_dup_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_dup(SB)
-
GLOBL ·libc_dup_trampoline_addr(SB), RODATA, $8
DATA ·libc_dup_trampoline_addr(SB)/8, $libc_dup_trampoline<>(SB)
TEXT libc_dup2_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_dup2(SB)
-
GLOBL ·libc_dup2_trampoline_addr(SB), RODATA, $8
DATA ·libc_dup2_trampoline_addr(SB)/8, $libc_dup2_trampoline<>(SB)
TEXT libc_exchangedata_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_exchangedata(SB)
-
GLOBL ·libc_exchangedata_trampoline_addr(SB), RODATA, $8
DATA ·libc_exchangedata_trampoline_addr(SB)/8, $libc_exchangedata_trampoline<>(SB)
TEXT libc_exit_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_exit(SB)
-
GLOBL ·libc_exit_trampoline_addr(SB), RODATA, $8
DATA ·libc_exit_trampoline_addr(SB)/8, $libc_exit_trampoline<>(SB)
TEXT libc_faccessat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_faccessat(SB)
-
GLOBL ·libc_faccessat_trampoline_addr(SB), RODATA, $8
DATA ·libc_faccessat_trampoline_addr(SB)/8, $libc_faccessat_trampoline<>(SB)
TEXT libc_fchdir_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fchdir(SB)
-
GLOBL ·libc_fchdir_trampoline_addr(SB), RODATA, $8
DATA ·libc_fchdir_trampoline_addr(SB)/8, $libc_fchdir_trampoline<>(SB)
TEXT libc_fchflags_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fchflags(SB)
-
GLOBL ·libc_fchflags_trampoline_addr(SB), RODATA, $8
DATA ·libc_fchflags_trampoline_addr(SB)/8, $libc_fchflags_trampoline<>(SB)
TEXT libc_fchmod_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fchmod(SB)
-
GLOBL ·libc_fchmod_trampoline_addr(SB), RODATA, $8
DATA ·libc_fchmod_trampoline_addr(SB)/8, $libc_fchmod_trampoline<>(SB)
TEXT libc_fchmodat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fchmodat(SB)
-
GLOBL ·libc_fchmodat_trampoline_addr(SB), RODATA, $8
DATA ·libc_fchmodat_trampoline_addr(SB)/8, $libc_fchmodat_trampoline<>(SB)
TEXT libc_fchown_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fchown(SB)
-
GLOBL ·libc_fchown_trampoline_addr(SB), RODATA, $8
DATA ·libc_fchown_trampoline_addr(SB)/8, $libc_fchown_trampoline<>(SB)
TEXT libc_fchownat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fchownat(SB)
-
GLOBL ·libc_fchownat_trampoline_addr(SB), RODATA, $8
DATA ·libc_fchownat_trampoline_addr(SB)/8, $libc_fchownat_trampoline<>(SB)
TEXT libc_fclonefileat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fclonefileat(SB)
-
GLOBL ·libc_fclonefileat_trampoline_addr(SB), RODATA, $8
DATA ·libc_fclonefileat_trampoline_addr(SB)/8, $libc_fclonefileat_trampoline<>(SB)
TEXT libc_flock_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_flock(SB)
-
GLOBL ·libc_flock_trampoline_addr(SB), RODATA, $8
DATA ·libc_flock_trampoline_addr(SB)/8, $libc_flock_trampoline<>(SB)
TEXT libc_fpathconf_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fpathconf(SB)
-
GLOBL ·libc_fpathconf_trampoline_addr(SB), RODATA, $8
DATA ·libc_fpathconf_trampoline_addr(SB)/8, $libc_fpathconf_trampoline<>(SB)
TEXT libc_fsync_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fsync(SB)
-
GLOBL ·libc_fsync_trampoline_addr(SB), RODATA, $8
DATA ·libc_fsync_trampoline_addr(SB)/8, $libc_fsync_trampoline<>(SB)
TEXT libc_ftruncate_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_ftruncate(SB)
-
GLOBL ·libc_ftruncate_trampoline_addr(SB), RODATA, $8
DATA ·libc_ftruncate_trampoline_addr(SB)/8, $libc_ftruncate_trampoline<>(SB)
TEXT libc_getcwd_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getcwd(SB)
-
GLOBL ·libc_getcwd_trampoline_addr(SB), RODATA, $8
DATA ·libc_getcwd_trampoline_addr(SB)/8, $libc_getcwd_trampoline<>(SB)
TEXT libc_getdtablesize_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getdtablesize(SB)
-
GLOBL ·libc_getdtablesize_trampoline_addr(SB), RODATA, $8
DATA ·libc_getdtablesize_trampoline_addr(SB)/8, $libc_getdtablesize_trampoline<>(SB)
TEXT libc_getegid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getegid(SB)
-
GLOBL ·libc_getegid_trampoline_addr(SB), RODATA, $8
DATA ·libc_getegid_trampoline_addr(SB)/8, $libc_getegid_trampoline<>(SB)
TEXT libc_geteuid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_geteuid(SB)
-
GLOBL ·libc_geteuid_trampoline_addr(SB), RODATA, $8
DATA ·libc_geteuid_trampoline_addr(SB)/8, $libc_geteuid_trampoline<>(SB)
TEXT libc_getgid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getgid(SB)
-
GLOBL ·libc_getgid_trampoline_addr(SB), RODATA, $8
DATA ·libc_getgid_trampoline_addr(SB)/8, $libc_getgid_trampoline<>(SB)
TEXT libc_getpgid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getpgid(SB)
-
GLOBL ·libc_getpgid_trampoline_addr(SB), RODATA, $8
DATA ·libc_getpgid_trampoline_addr(SB)/8, $libc_getpgid_trampoline<>(SB)
TEXT libc_getpgrp_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getpgrp(SB)
-
GLOBL ·libc_getpgrp_trampoline_addr(SB), RODATA, $8
DATA ·libc_getpgrp_trampoline_addr(SB)/8, $libc_getpgrp_trampoline<>(SB)
TEXT libc_getpid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getpid(SB)
-
GLOBL ·libc_getpid_trampoline_addr(SB), RODATA, $8
DATA ·libc_getpid_trampoline_addr(SB)/8, $libc_getpid_trampoline<>(SB)
TEXT libc_getppid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getppid(SB)
-
GLOBL ·libc_getppid_trampoline_addr(SB), RODATA, $8
DATA ·libc_getppid_trampoline_addr(SB)/8, $libc_getppid_trampoline<>(SB)
TEXT libc_getpriority_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getpriority(SB)
-
GLOBL ·libc_getpriority_trampoline_addr(SB), RODATA, $8
DATA ·libc_getpriority_trampoline_addr(SB)/8, $libc_getpriority_trampoline<>(SB)
TEXT libc_getrlimit_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getrlimit(SB)
-
GLOBL ·libc_getrlimit_trampoline_addr(SB), RODATA, $8
DATA ·libc_getrlimit_trampoline_addr(SB)/8, $libc_getrlimit_trampoline<>(SB)
TEXT libc_getrusage_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getrusage(SB)
-
GLOBL ·libc_getrusage_trampoline_addr(SB), RODATA, $8
DATA ·libc_getrusage_trampoline_addr(SB)/8, $libc_getrusage_trampoline<>(SB)
TEXT libc_getsid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getsid(SB)
-
GLOBL ·libc_getsid_trampoline_addr(SB), RODATA, $8
DATA ·libc_getsid_trampoline_addr(SB)/8, $libc_getsid_trampoline<>(SB)
TEXT libc_gettimeofday_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_gettimeofday(SB)
-
GLOBL ·libc_gettimeofday_trampoline_addr(SB), RODATA, $8
DATA ·libc_gettimeofday_trampoline_addr(SB)/8, $libc_gettimeofday_trampoline<>(SB)
TEXT libc_getuid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getuid(SB)
-
GLOBL ·libc_getuid_trampoline_addr(SB), RODATA, $8
DATA ·libc_getuid_trampoline_addr(SB)/8, $libc_getuid_trampoline<>(SB)
TEXT libc_issetugid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_issetugid(SB)
-
GLOBL ·libc_issetugid_trampoline_addr(SB), RODATA, $8
DATA ·libc_issetugid_trampoline_addr(SB)/8, $libc_issetugid_trampoline<>(SB)
TEXT libc_kqueue_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_kqueue(SB)
-
GLOBL ·libc_kqueue_trampoline_addr(SB), RODATA, $8
DATA ·libc_kqueue_trampoline_addr(SB)/8, $libc_kqueue_trampoline<>(SB)
TEXT libc_lchown_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_lchown(SB)
-
GLOBL ·libc_lchown_trampoline_addr(SB), RODATA, $8
DATA ·libc_lchown_trampoline_addr(SB)/8, $libc_lchown_trampoline<>(SB)
TEXT libc_link_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_link(SB)
-
GLOBL ·libc_link_trampoline_addr(SB), RODATA, $8
DATA ·libc_link_trampoline_addr(SB)/8, $libc_link_trampoline<>(SB)
TEXT libc_linkat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_linkat(SB)
-
GLOBL ·libc_linkat_trampoline_addr(SB), RODATA, $8
DATA ·libc_linkat_trampoline_addr(SB)/8, $libc_linkat_trampoline<>(SB)
TEXT libc_listen_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_listen(SB)
-
GLOBL ·libc_listen_trampoline_addr(SB), RODATA, $8
DATA ·libc_listen_trampoline_addr(SB)/8, $libc_listen_trampoline<>(SB)
TEXT libc_mkdir_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mkdir(SB)
-
GLOBL ·libc_mkdir_trampoline_addr(SB), RODATA, $8
DATA ·libc_mkdir_trampoline_addr(SB)/8, $libc_mkdir_trampoline<>(SB)
TEXT libc_mkdirat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mkdirat(SB)
-
GLOBL ·libc_mkdirat_trampoline_addr(SB), RODATA, $8
DATA ·libc_mkdirat_trampoline_addr(SB)/8, $libc_mkdirat_trampoline<>(SB)
TEXT libc_mkfifo_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mkfifo(SB)
-
GLOBL ·libc_mkfifo_trampoline_addr(SB), RODATA, $8
DATA ·libc_mkfifo_trampoline_addr(SB)/8, $libc_mkfifo_trampoline<>(SB)
TEXT libc_mknod_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mknod(SB)
-
GLOBL ·libc_mknod_trampoline_addr(SB), RODATA, $8
DATA ·libc_mknod_trampoline_addr(SB)/8, $libc_mknod_trampoline<>(SB)
TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mount(SB)
-
GLOBL ·libc_mount_trampoline_addr(SB), RODATA, $8
DATA ·libc_mount_trampoline_addr(SB)/8, $libc_mount_trampoline<>(SB)
TEXT libc_open_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_open(SB)
-
GLOBL ·libc_open_trampoline_addr(SB), RODATA, $8
DATA ·libc_open_trampoline_addr(SB)/8, $libc_open_trampoline<>(SB)
TEXT libc_openat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_openat(SB)
-
GLOBL ·libc_openat_trampoline_addr(SB), RODATA, $8
DATA ·libc_openat_trampoline_addr(SB)/8, $libc_openat_trampoline<>(SB)
TEXT libc_pathconf_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_pathconf(SB)
-
GLOBL ·libc_pathconf_trampoline_addr(SB), RODATA, $8
DATA ·libc_pathconf_trampoline_addr(SB)/8, $libc_pathconf_trampoline<>(SB)
TEXT libc_pread_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_pread(SB)
-
GLOBL ·libc_pread_trampoline_addr(SB), RODATA, $8
DATA ·libc_pread_trampoline_addr(SB)/8, $libc_pread_trampoline<>(SB)
TEXT libc_pwrite_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_pwrite(SB)
-
GLOBL ·libc_pwrite_trampoline_addr(SB), RODATA, $8
DATA ·libc_pwrite_trampoline_addr(SB)/8, $libc_pwrite_trampoline<>(SB)
TEXT libc_read_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_read(SB)
-
GLOBL ·libc_read_trampoline_addr(SB), RODATA, $8
DATA ·libc_read_trampoline_addr(SB)/8, $libc_read_trampoline<>(SB)
TEXT libc_readlink_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_readlink(SB)
-
GLOBL ·libc_readlink_trampoline_addr(SB), RODATA, $8
DATA ·libc_readlink_trampoline_addr(SB)/8, $libc_readlink_trampoline<>(SB)
TEXT libc_readlinkat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_readlinkat(SB)
-
GLOBL ·libc_readlinkat_trampoline_addr(SB), RODATA, $8
DATA ·libc_readlinkat_trampoline_addr(SB)/8, $libc_readlinkat_trampoline<>(SB)
TEXT libc_rename_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_rename(SB)
-
GLOBL ·libc_rename_trampoline_addr(SB), RODATA, $8
DATA ·libc_rename_trampoline_addr(SB)/8, $libc_rename_trampoline<>(SB)
TEXT libc_renameat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_renameat(SB)
-
GLOBL ·libc_renameat_trampoline_addr(SB), RODATA, $8
DATA ·libc_renameat_trampoline_addr(SB)/8, $libc_renameat_trampoline<>(SB)
TEXT libc_revoke_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_revoke(SB)
-
GLOBL ·libc_revoke_trampoline_addr(SB), RODATA, $8
DATA ·libc_revoke_trampoline_addr(SB)/8, $libc_revoke_trampoline<>(SB)
TEXT libc_rmdir_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_rmdir(SB)
-
GLOBL ·libc_rmdir_trampoline_addr(SB), RODATA, $8
DATA ·libc_rmdir_trampoline_addr(SB)/8, $libc_rmdir_trampoline<>(SB)
TEXT libc_lseek_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_lseek(SB)
-
GLOBL ·libc_lseek_trampoline_addr(SB), RODATA, $8
DATA ·libc_lseek_trampoline_addr(SB)/8, $libc_lseek_trampoline<>(SB)
TEXT libc_select_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_select(SB)
-
GLOBL ·libc_select_trampoline_addr(SB), RODATA, $8
DATA ·libc_select_trampoline_addr(SB)/8, $libc_select_trampoline<>(SB)
@@ -712,192 +595,160 @@ DATA ·libc_setattrlist_trampoline_addr(SB)/8, $libc_setattrlist_trampoline<>(SB
TEXT libc_setegid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setegid(SB)
-
GLOBL ·libc_setegid_trampoline_addr(SB), RODATA, $8
DATA ·libc_setegid_trampoline_addr(SB)/8, $libc_setegid_trampoline<>(SB)
TEXT libc_seteuid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_seteuid(SB)
-
GLOBL ·libc_seteuid_trampoline_addr(SB), RODATA, $8
DATA ·libc_seteuid_trampoline_addr(SB)/8, $libc_seteuid_trampoline<>(SB)
TEXT libc_setgid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setgid(SB)
-
GLOBL ·libc_setgid_trampoline_addr(SB), RODATA, $8
DATA ·libc_setgid_trampoline_addr(SB)/8, $libc_setgid_trampoline<>(SB)
TEXT libc_setlogin_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setlogin(SB)
-
GLOBL ·libc_setlogin_trampoline_addr(SB), RODATA, $8
DATA ·libc_setlogin_trampoline_addr(SB)/8, $libc_setlogin_trampoline<>(SB)
TEXT libc_setpgid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setpgid(SB)
-
GLOBL ·libc_setpgid_trampoline_addr(SB), RODATA, $8
DATA ·libc_setpgid_trampoline_addr(SB)/8, $libc_setpgid_trampoline<>(SB)
TEXT libc_setpriority_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setpriority(SB)
-
GLOBL ·libc_setpriority_trampoline_addr(SB), RODATA, $8
DATA ·libc_setpriority_trampoline_addr(SB)/8, $libc_setpriority_trampoline<>(SB)
TEXT libc_setprivexec_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setprivexec(SB)
-
GLOBL ·libc_setprivexec_trampoline_addr(SB), RODATA, $8
DATA ·libc_setprivexec_trampoline_addr(SB)/8, $libc_setprivexec_trampoline<>(SB)
TEXT libc_setregid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setregid(SB)
-
GLOBL ·libc_setregid_trampoline_addr(SB), RODATA, $8
DATA ·libc_setregid_trampoline_addr(SB)/8, $libc_setregid_trampoline<>(SB)
TEXT libc_setreuid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setreuid(SB)
-
GLOBL ·libc_setreuid_trampoline_addr(SB), RODATA, $8
DATA ·libc_setreuid_trampoline_addr(SB)/8, $libc_setreuid_trampoline<>(SB)
TEXT libc_setsid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setsid(SB)
-
GLOBL ·libc_setsid_trampoline_addr(SB), RODATA, $8
DATA ·libc_setsid_trampoline_addr(SB)/8, $libc_setsid_trampoline<>(SB)
TEXT libc_settimeofday_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_settimeofday(SB)
-
GLOBL ·libc_settimeofday_trampoline_addr(SB), RODATA, $8
DATA ·libc_settimeofday_trampoline_addr(SB)/8, $libc_settimeofday_trampoline<>(SB)
TEXT libc_setuid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setuid(SB)
-
GLOBL ·libc_setuid_trampoline_addr(SB), RODATA, $8
DATA ·libc_setuid_trampoline_addr(SB)/8, $libc_setuid_trampoline<>(SB)
TEXT libc_symlink_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_symlink(SB)
-
GLOBL ·libc_symlink_trampoline_addr(SB), RODATA, $8
DATA ·libc_symlink_trampoline_addr(SB)/8, $libc_symlink_trampoline<>(SB)
TEXT libc_symlinkat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_symlinkat(SB)
-
GLOBL ·libc_symlinkat_trampoline_addr(SB), RODATA, $8
DATA ·libc_symlinkat_trampoline_addr(SB)/8, $libc_symlinkat_trampoline<>(SB)
TEXT libc_sync_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_sync(SB)
-
GLOBL ·libc_sync_trampoline_addr(SB), RODATA, $8
DATA ·libc_sync_trampoline_addr(SB)/8, $libc_sync_trampoline<>(SB)
TEXT libc_truncate_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_truncate(SB)
-
GLOBL ·libc_truncate_trampoline_addr(SB), RODATA, $8
DATA ·libc_truncate_trampoline_addr(SB)/8, $libc_truncate_trampoline<>(SB)
TEXT libc_umask_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_umask(SB)
-
GLOBL ·libc_umask_trampoline_addr(SB), RODATA, $8
DATA ·libc_umask_trampoline_addr(SB)/8, $libc_umask_trampoline<>(SB)
TEXT libc_undelete_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_undelete(SB)
-
GLOBL ·libc_undelete_trampoline_addr(SB), RODATA, $8
DATA ·libc_undelete_trampoline_addr(SB)/8, $libc_undelete_trampoline<>(SB)
TEXT libc_unlink_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_unlink(SB)
-
GLOBL ·libc_unlink_trampoline_addr(SB), RODATA, $8
DATA ·libc_unlink_trampoline_addr(SB)/8, $libc_unlink_trampoline<>(SB)
TEXT libc_unlinkat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_unlinkat(SB)
-
GLOBL ·libc_unlinkat_trampoline_addr(SB), RODATA, $8
DATA ·libc_unlinkat_trampoline_addr(SB)/8, $libc_unlinkat_trampoline<>(SB)
TEXT libc_unmount_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_unmount(SB)
-
GLOBL ·libc_unmount_trampoline_addr(SB), RODATA, $8
DATA ·libc_unmount_trampoline_addr(SB)/8, $libc_unmount_trampoline<>(SB)
TEXT libc_write_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_write(SB)
-
GLOBL ·libc_write_trampoline_addr(SB), RODATA, $8
DATA ·libc_write_trampoline_addr(SB)/8, $libc_write_trampoline<>(SB)
TEXT libc_mmap_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mmap(SB)
-
GLOBL ·libc_mmap_trampoline_addr(SB), RODATA, $8
DATA ·libc_mmap_trampoline_addr(SB)/8, $libc_mmap_trampoline<>(SB)
TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_munmap(SB)
-
GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $8
DATA ·libc_munmap_trampoline_addr(SB)/8, $libc_munmap_trampoline<>(SB)
TEXT libc_fstat64_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fstat64(SB)
-
GLOBL ·libc_fstat64_trampoline_addr(SB), RODATA, $8
DATA ·libc_fstat64_trampoline_addr(SB)/8, $libc_fstat64_trampoline<>(SB)
TEXT libc_fstatat64_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fstatat64(SB)
-
GLOBL ·libc_fstatat64_trampoline_addr(SB), RODATA, $8
DATA ·libc_fstatat64_trampoline_addr(SB)/8, $libc_fstatat64_trampoline<>(SB)
TEXT libc_fstatfs64_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fstatfs64(SB)
-
GLOBL ·libc_fstatfs64_trampoline_addr(SB), RODATA, $8
DATA ·libc_fstatfs64_trampoline_addr(SB)/8, $libc_fstatfs64_trampoline<>(SB)
TEXT libc_getfsstat64_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getfsstat64(SB)
-
GLOBL ·libc_getfsstat64_trampoline_addr(SB), RODATA, $8
DATA ·libc_getfsstat64_trampoline_addr(SB)/8, $libc_getfsstat64_trampoline<>(SB)
TEXT libc_lstat64_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_lstat64(SB)
-
GLOBL ·libc_lstat64_trampoline_addr(SB), RODATA, $8
DATA ·libc_lstat64_trampoline_addr(SB)/8, $libc_lstat64_trampoline<>(SB)
TEXT libc_ptrace_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_ptrace(SB)
-
GLOBL ·libc_ptrace_trampoline_addr(SB), RODATA, $8
DATA ·libc_ptrace_trampoline_addr(SB)/8, $libc_ptrace_trampoline<>(SB)
TEXT libc_stat64_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_stat64(SB)
-
GLOBL ·libc_stat64_trampoline_addr(SB), RODATA, $8
DATA ·libc_stat64_trampoline_addr(SB)/8, $libc_stat64_trampoline<>(SB)
TEXT libc_statfs64_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_statfs64(SB)
-
GLOBL ·libc_statfs64_trampoline_addr(SB), RODATA, $8
DATA ·libc_statfs64_trampoline_addr(SB)/8, $libc_statfs64_trampoline<>(SB)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
index 51d6f3fb2..1b40b997b 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build darwin && arm64
-// +build darwin,arm64
package unix
@@ -725,6 +724,12 @@ func ioctl(fd int, req uint, arg uintptr) (err error) {
return
}
+var libc_ioctl_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_ioctl ioctl "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
_, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg))
if e1 != 0 {
@@ -733,10 +738,6 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
return
}
-var libc_ioctl_trampoline_addr uintptr
-
-//go:cgo_import_dynamic libc_ioctl ioctl "/usr/lib/libSystem.B.dylib"
-
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) {
@@ -2410,28 +2411,6 @@ var libc_munmap_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
func Fstat(fd int, stat *Stat_t) (err error) {
_, _, e1 := syscall_syscall(libc_fstat_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0)
if e1 != 0 {
@@ -2521,14 +2500,6 @@ func ptrace1(request int, pid int, addr uintptr, data uintptr) (err error) {
return
}
-func ptrace1Ptr(request int, pid int, addr uintptr, data unsafe.Pointer) (err error) {
- _, _, e1 := syscall_syscall6(libc_ptrace_trampoline_addr, uintptr(request), uintptr(pid), addr, uintptr(data), 0, 0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
var libc_ptrace_trampoline_addr uintptr
//go:cgo_import_dynamic libc_ptrace ptrace "/usr/lib/libSystem.B.dylib"
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
index c3b82c037..08362c1ab 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
@@ -5,703 +5,586 @@
TEXT libc_fdopendir_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fdopendir(SB)
-
GLOBL ·libc_fdopendir_trampoline_addr(SB), RODATA, $8
DATA ·libc_fdopendir_trampoline_addr(SB)/8, $libc_fdopendir_trampoline<>(SB)
TEXT libc_getgroups_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getgroups(SB)
-
GLOBL ·libc_getgroups_trampoline_addr(SB), RODATA, $8
DATA ·libc_getgroups_trampoline_addr(SB)/8, $libc_getgroups_trampoline<>(SB)
TEXT libc_setgroups_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setgroups(SB)
-
GLOBL ·libc_setgroups_trampoline_addr(SB), RODATA, $8
DATA ·libc_setgroups_trampoline_addr(SB)/8, $libc_setgroups_trampoline<>(SB)
TEXT libc_wait4_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_wait4(SB)
-
GLOBL ·libc_wait4_trampoline_addr(SB), RODATA, $8
DATA ·libc_wait4_trampoline_addr(SB)/8, $libc_wait4_trampoline<>(SB)
TEXT libc_accept_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_accept(SB)
-
GLOBL ·libc_accept_trampoline_addr(SB), RODATA, $8
DATA ·libc_accept_trampoline_addr(SB)/8, $libc_accept_trampoline<>(SB)
TEXT libc_bind_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_bind(SB)
-
GLOBL ·libc_bind_trampoline_addr(SB), RODATA, $8
DATA ·libc_bind_trampoline_addr(SB)/8, $libc_bind_trampoline<>(SB)
TEXT libc_connect_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_connect(SB)
-
GLOBL ·libc_connect_trampoline_addr(SB), RODATA, $8
DATA ·libc_connect_trampoline_addr(SB)/8, $libc_connect_trampoline<>(SB)
TEXT libc_socket_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_socket(SB)
-
GLOBL ·libc_socket_trampoline_addr(SB), RODATA, $8
DATA ·libc_socket_trampoline_addr(SB)/8, $libc_socket_trampoline<>(SB)
TEXT libc_getsockopt_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getsockopt(SB)
-
GLOBL ·libc_getsockopt_trampoline_addr(SB), RODATA, $8
DATA ·libc_getsockopt_trampoline_addr(SB)/8, $libc_getsockopt_trampoline<>(SB)
TEXT libc_setsockopt_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setsockopt(SB)
-
GLOBL ·libc_setsockopt_trampoline_addr(SB), RODATA, $8
DATA ·libc_setsockopt_trampoline_addr(SB)/8, $libc_setsockopt_trampoline<>(SB)
TEXT libc_getpeername_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getpeername(SB)
-
GLOBL ·libc_getpeername_trampoline_addr(SB), RODATA, $8
DATA ·libc_getpeername_trampoline_addr(SB)/8, $libc_getpeername_trampoline<>(SB)
TEXT libc_getsockname_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getsockname(SB)
-
GLOBL ·libc_getsockname_trampoline_addr(SB), RODATA, $8
DATA ·libc_getsockname_trampoline_addr(SB)/8, $libc_getsockname_trampoline<>(SB)
TEXT libc_shutdown_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_shutdown(SB)
-
GLOBL ·libc_shutdown_trampoline_addr(SB), RODATA, $8
DATA ·libc_shutdown_trampoline_addr(SB)/8, $libc_shutdown_trampoline<>(SB)
TEXT libc_socketpair_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_socketpair(SB)
-
GLOBL ·libc_socketpair_trampoline_addr(SB), RODATA, $8
DATA ·libc_socketpair_trampoline_addr(SB)/8, $libc_socketpair_trampoline<>(SB)
TEXT libc_recvfrom_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_recvfrom(SB)
-
GLOBL ·libc_recvfrom_trampoline_addr(SB), RODATA, $8
DATA ·libc_recvfrom_trampoline_addr(SB)/8, $libc_recvfrom_trampoline<>(SB)
TEXT libc_sendto_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_sendto(SB)
-
GLOBL ·libc_sendto_trampoline_addr(SB), RODATA, $8
DATA ·libc_sendto_trampoline_addr(SB)/8, $libc_sendto_trampoline<>(SB)
TEXT libc_recvmsg_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_recvmsg(SB)
-
GLOBL ·libc_recvmsg_trampoline_addr(SB), RODATA, $8
DATA ·libc_recvmsg_trampoline_addr(SB)/8, $libc_recvmsg_trampoline<>(SB)
TEXT libc_sendmsg_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_sendmsg(SB)
-
GLOBL ·libc_sendmsg_trampoline_addr(SB), RODATA, $8
DATA ·libc_sendmsg_trampoline_addr(SB)/8, $libc_sendmsg_trampoline<>(SB)
TEXT libc_kevent_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_kevent(SB)
-
GLOBL ·libc_kevent_trampoline_addr(SB), RODATA, $8
DATA ·libc_kevent_trampoline_addr(SB)/8, $libc_kevent_trampoline<>(SB)
TEXT libc_utimes_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_utimes(SB)
-
GLOBL ·libc_utimes_trampoline_addr(SB), RODATA, $8
DATA ·libc_utimes_trampoline_addr(SB)/8, $libc_utimes_trampoline<>(SB)
TEXT libc_futimes_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_futimes(SB)
-
GLOBL ·libc_futimes_trampoline_addr(SB), RODATA, $8
DATA ·libc_futimes_trampoline_addr(SB)/8, $libc_futimes_trampoline<>(SB)
TEXT libc_poll_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_poll(SB)
-
GLOBL ·libc_poll_trampoline_addr(SB), RODATA, $8
DATA ·libc_poll_trampoline_addr(SB)/8, $libc_poll_trampoline<>(SB)
TEXT libc_madvise_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_madvise(SB)
-
GLOBL ·libc_madvise_trampoline_addr(SB), RODATA, $8
DATA ·libc_madvise_trampoline_addr(SB)/8, $libc_madvise_trampoline<>(SB)
TEXT libc_mlock_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mlock(SB)
-
GLOBL ·libc_mlock_trampoline_addr(SB), RODATA, $8
DATA ·libc_mlock_trampoline_addr(SB)/8, $libc_mlock_trampoline<>(SB)
TEXT libc_mlockall_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mlockall(SB)
-
GLOBL ·libc_mlockall_trampoline_addr(SB), RODATA, $8
DATA ·libc_mlockall_trampoline_addr(SB)/8, $libc_mlockall_trampoline<>(SB)
TEXT libc_mprotect_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mprotect(SB)
-
GLOBL ·libc_mprotect_trampoline_addr(SB), RODATA, $8
DATA ·libc_mprotect_trampoline_addr(SB)/8, $libc_mprotect_trampoline<>(SB)
TEXT libc_msync_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_msync(SB)
-
GLOBL ·libc_msync_trampoline_addr(SB), RODATA, $8
DATA ·libc_msync_trampoline_addr(SB)/8, $libc_msync_trampoline<>(SB)
TEXT libc_munlock_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_munlock(SB)
-
GLOBL ·libc_munlock_trampoline_addr(SB), RODATA, $8
DATA ·libc_munlock_trampoline_addr(SB)/8, $libc_munlock_trampoline<>(SB)
TEXT libc_munlockall_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_munlockall(SB)
-
GLOBL ·libc_munlockall_trampoline_addr(SB), RODATA, $8
DATA ·libc_munlockall_trampoline_addr(SB)/8, $libc_munlockall_trampoline<>(SB)
TEXT libc_closedir_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_closedir(SB)
-
GLOBL ·libc_closedir_trampoline_addr(SB), RODATA, $8
DATA ·libc_closedir_trampoline_addr(SB)/8, $libc_closedir_trampoline<>(SB)
TEXT libc_readdir_r_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_readdir_r(SB)
-
GLOBL ·libc_readdir_r_trampoline_addr(SB), RODATA, $8
DATA ·libc_readdir_r_trampoline_addr(SB)/8, $libc_readdir_r_trampoline<>(SB)
TEXT libc_pipe_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_pipe(SB)
-
GLOBL ·libc_pipe_trampoline_addr(SB), RODATA, $8
DATA ·libc_pipe_trampoline_addr(SB)/8, $libc_pipe_trampoline<>(SB)
TEXT libc_getxattr_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getxattr(SB)
-
GLOBL ·libc_getxattr_trampoline_addr(SB), RODATA, $8
DATA ·libc_getxattr_trampoline_addr(SB)/8, $libc_getxattr_trampoline<>(SB)
TEXT libc_fgetxattr_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fgetxattr(SB)
-
GLOBL ·libc_fgetxattr_trampoline_addr(SB), RODATA, $8
DATA ·libc_fgetxattr_trampoline_addr(SB)/8, $libc_fgetxattr_trampoline<>(SB)
TEXT libc_setxattr_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setxattr(SB)
-
GLOBL ·libc_setxattr_trampoline_addr(SB), RODATA, $8
DATA ·libc_setxattr_trampoline_addr(SB)/8, $libc_setxattr_trampoline<>(SB)
TEXT libc_fsetxattr_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fsetxattr(SB)
-
GLOBL ·libc_fsetxattr_trampoline_addr(SB), RODATA, $8
DATA ·libc_fsetxattr_trampoline_addr(SB)/8, $libc_fsetxattr_trampoline<>(SB)
TEXT libc_removexattr_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_removexattr(SB)
-
GLOBL ·libc_removexattr_trampoline_addr(SB), RODATA, $8
DATA ·libc_removexattr_trampoline_addr(SB)/8, $libc_removexattr_trampoline<>(SB)
TEXT libc_fremovexattr_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fremovexattr(SB)
-
GLOBL ·libc_fremovexattr_trampoline_addr(SB), RODATA, $8
DATA ·libc_fremovexattr_trampoline_addr(SB)/8, $libc_fremovexattr_trampoline<>(SB)
TEXT libc_listxattr_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_listxattr(SB)
-
GLOBL ·libc_listxattr_trampoline_addr(SB), RODATA, $8
DATA ·libc_listxattr_trampoline_addr(SB)/8, $libc_listxattr_trampoline<>(SB)
TEXT libc_flistxattr_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_flistxattr(SB)
-
GLOBL ·libc_flistxattr_trampoline_addr(SB), RODATA, $8
DATA ·libc_flistxattr_trampoline_addr(SB)/8, $libc_flistxattr_trampoline<>(SB)
TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_utimensat(SB)
-
GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $8
DATA ·libc_utimensat_trampoline_addr(SB)/8, $libc_utimensat_trampoline<>(SB)
TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fcntl(SB)
-
GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $8
DATA ·libc_fcntl_trampoline_addr(SB)/8, $libc_fcntl_trampoline<>(SB)
TEXT libc_kill_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_kill(SB)
-
GLOBL ·libc_kill_trampoline_addr(SB), RODATA, $8
DATA ·libc_kill_trampoline_addr(SB)/8, $libc_kill_trampoline<>(SB)
TEXT libc_ioctl_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_ioctl(SB)
-
GLOBL ·libc_ioctl_trampoline_addr(SB), RODATA, $8
DATA ·libc_ioctl_trampoline_addr(SB)/8, $libc_ioctl_trampoline<>(SB)
TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_sysctl(SB)
-
GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8
DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB)
TEXT libc_sendfile_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_sendfile(SB)
-
GLOBL ·libc_sendfile_trampoline_addr(SB), RODATA, $8
DATA ·libc_sendfile_trampoline_addr(SB)/8, $libc_sendfile_trampoline<>(SB)
TEXT libc_shmat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_shmat(SB)
-
GLOBL ·libc_shmat_trampoline_addr(SB), RODATA, $8
DATA ·libc_shmat_trampoline_addr(SB)/8, $libc_shmat_trampoline<>(SB)
TEXT libc_shmctl_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_shmctl(SB)
-
GLOBL ·libc_shmctl_trampoline_addr(SB), RODATA, $8
DATA ·libc_shmctl_trampoline_addr(SB)/8, $libc_shmctl_trampoline<>(SB)
TEXT libc_shmdt_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_shmdt(SB)
-
GLOBL ·libc_shmdt_trampoline_addr(SB), RODATA, $8
DATA ·libc_shmdt_trampoline_addr(SB)/8, $libc_shmdt_trampoline<>(SB)
TEXT libc_shmget_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_shmget(SB)
-
GLOBL ·libc_shmget_trampoline_addr(SB), RODATA, $8
DATA ·libc_shmget_trampoline_addr(SB)/8, $libc_shmget_trampoline<>(SB)
TEXT libc_access_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_access(SB)
-
GLOBL ·libc_access_trampoline_addr(SB), RODATA, $8
DATA ·libc_access_trampoline_addr(SB)/8, $libc_access_trampoline<>(SB)
TEXT libc_adjtime_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_adjtime(SB)
-
GLOBL ·libc_adjtime_trampoline_addr(SB), RODATA, $8
DATA ·libc_adjtime_trampoline_addr(SB)/8, $libc_adjtime_trampoline<>(SB)
TEXT libc_chdir_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_chdir(SB)
-
GLOBL ·libc_chdir_trampoline_addr(SB), RODATA, $8
DATA ·libc_chdir_trampoline_addr(SB)/8, $libc_chdir_trampoline<>(SB)
TEXT libc_chflags_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_chflags(SB)
-
GLOBL ·libc_chflags_trampoline_addr(SB), RODATA, $8
DATA ·libc_chflags_trampoline_addr(SB)/8, $libc_chflags_trampoline<>(SB)
TEXT libc_chmod_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_chmod(SB)
-
GLOBL ·libc_chmod_trampoline_addr(SB), RODATA, $8
DATA ·libc_chmod_trampoline_addr(SB)/8, $libc_chmod_trampoline<>(SB)
TEXT libc_chown_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_chown(SB)
-
GLOBL ·libc_chown_trampoline_addr(SB), RODATA, $8
DATA ·libc_chown_trampoline_addr(SB)/8, $libc_chown_trampoline<>(SB)
TEXT libc_chroot_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_chroot(SB)
-
GLOBL ·libc_chroot_trampoline_addr(SB), RODATA, $8
DATA ·libc_chroot_trampoline_addr(SB)/8, $libc_chroot_trampoline<>(SB)
TEXT libc_clock_gettime_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_clock_gettime(SB)
-
GLOBL ·libc_clock_gettime_trampoline_addr(SB), RODATA, $8
DATA ·libc_clock_gettime_trampoline_addr(SB)/8, $libc_clock_gettime_trampoline<>(SB)
TEXT libc_close_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_close(SB)
-
GLOBL ·libc_close_trampoline_addr(SB), RODATA, $8
DATA ·libc_close_trampoline_addr(SB)/8, $libc_close_trampoline<>(SB)
TEXT libc_clonefile_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_clonefile(SB)
-
GLOBL ·libc_clonefile_trampoline_addr(SB), RODATA, $8
DATA ·libc_clonefile_trampoline_addr(SB)/8, $libc_clonefile_trampoline<>(SB)
TEXT libc_clonefileat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_clonefileat(SB)
-
GLOBL ·libc_clonefileat_trampoline_addr(SB), RODATA, $8
DATA ·libc_clonefileat_trampoline_addr(SB)/8, $libc_clonefileat_trampoline<>(SB)
TEXT libc_dup_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_dup(SB)
-
GLOBL ·libc_dup_trampoline_addr(SB), RODATA, $8
DATA ·libc_dup_trampoline_addr(SB)/8, $libc_dup_trampoline<>(SB)
TEXT libc_dup2_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_dup2(SB)
-
GLOBL ·libc_dup2_trampoline_addr(SB), RODATA, $8
DATA ·libc_dup2_trampoline_addr(SB)/8, $libc_dup2_trampoline<>(SB)
TEXT libc_exchangedata_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_exchangedata(SB)
-
GLOBL ·libc_exchangedata_trampoline_addr(SB), RODATA, $8
DATA ·libc_exchangedata_trampoline_addr(SB)/8, $libc_exchangedata_trampoline<>(SB)
TEXT libc_exit_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_exit(SB)
-
GLOBL ·libc_exit_trampoline_addr(SB), RODATA, $8
DATA ·libc_exit_trampoline_addr(SB)/8, $libc_exit_trampoline<>(SB)
TEXT libc_faccessat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_faccessat(SB)
-
GLOBL ·libc_faccessat_trampoline_addr(SB), RODATA, $8
DATA ·libc_faccessat_trampoline_addr(SB)/8, $libc_faccessat_trampoline<>(SB)
TEXT libc_fchdir_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fchdir(SB)
-
GLOBL ·libc_fchdir_trampoline_addr(SB), RODATA, $8
DATA ·libc_fchdir_trampoline_addr(SB)/8, $libc_fchdir_trampoline<>(SB)
TEXT libc_fchflags_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fchflags(SB)
-
GLOBL ·libc_fchflags_trampoline_addr(SB), RODATA, $8
DATA ·libc_fchflags_trampoline_addr(SB)/8, $libc_fchflags_trampoline<>(SB)
TEXT libc_fchmod_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fchmod(SB)
-
GLOBL ·libc_fchmod_trampoline_addr(SB), RODATA, $8
DATA ·libc_fchmod_trampoline_addr(SB)/8, $libc_fchmod_trampoline<>(SB)
TEXT libc_fchmodat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fchmodat(SB)
-
GLOBL ·libc_fchmodat_trampoline_addr(SB), RODATA, $8
DATA ·libc_fchmodat_trampoline_addr(SB)/8, $libc_fchmodat_trampoline<>(SB)
TEXT libc_fchown_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fchown(SB)
-
GLOBL ·libc_fchown_trampoline_addr(SB), RODATA, $8
DATA ·libc_fchown_trampoline_addr(SB)/8, $libc_fchown_trampoline<>(SB)
TEXT libc_fchownat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fchownat(SB)
-
GLOBL ·libc_fchownat_trampoline_addr(SB), RODATA, $8
DATA ·libc_fchownat_trampoline_addr(SB)/8, $libc_fchownat_trampoline<>(SB)
TEXT libc_fclonefileat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fclonefileat(SB)
-
GLOBL ·libc_fclonefileat_trampoline_addr(SB), RODATA, $8
DATA ·libc_fclonefileat_trampoline_addr(SB)/8, $libc_fclonefileat_trampoline<>(SB)
TEXT libc_flock_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_flock(SB)
-
GLOBL ·libc_flock_trampoline_addr(SB), RODATA, $8
DATA ·libc_flock_trampoline_addr(SB)/8, $libc_flock_trampoline<>(SB)
TEXT libc_fpathconf_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fpathconf(SB)
-
GLOBL ·libc_fpathconf_trampoline_addr(SB), RODATA, $8
DATA ·libc_fpathconf_trampoline_addr(SB)/8, $libc_fpathconf_trampoline<>(SB)
TEXT libc_fsync_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fsync(SB)
-
GLOBL ·libc_fsync_trampoline_addr(SB), RODATA, $8
DATA ·libc_fsync_trampoline_addr(SB)/8, $libc_fsync_trampoline<>(SB)
TEXT libc_ftruncate_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_ftruncate(SB)
-
GLOBL ·libc_ftruncate_trampoline_addr(SB), RODATA, $8
DATA ·libc_ftruncate_trampoline_addr(SB)/8, $libc_ftruncate_trampoline<>(SB)
TEXT libc_getcwd_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getcwd(SB)
-
GLOBL ·libc_getcwd_trampoline_addr(SB), RODATA, $8
DATA ·libc_getcwd_trampoline_addr(SB)/8, $libc_getcwd_trampoline<>(SB)
TEXT libc_getdtablesize_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getdtablesize(SB)
-
GLOBL ·libc_getdtablesize_trampoline_addr(SB), RODATA, $8
DATA ·libc_getdtablesize_trampoline_addr(SB)/8, $libc_getdtablesize_trampoline<>(SB)
TEXT libc_getegid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getegid(SB)
-
GLOBL ·libc_getegid_trampoline_addr(SB), RODATA, $8
DATA ·libc_getegid_trampoline_addr(SB)/8, $libc_getegid_trampoline<>(SB)
TEXT libc_geteuid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_geteuid(SB)
-
GLOBL ·libc_geteuid_trampoline_addr(SB), RODATA, $8
DATA ·libc_geteuid_trampoline_addr(SB)/8, $libc_geteuid_trampoline<>(SB)
TEXT libc_getgid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getgid(SB)
-
GLOBL ·libc_getgid_trampoline_addr(SB), RODATA, $8
DATA ·libc_getgid_trampoline_addr(SB)/8, $libc_getgid_trampoline<>(SB)
TEXT libc_getpgid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getpgid(SB)
-
GLOBL ·libc_getpgid_trampoline_addr(SB), RODATA, $8
DATA ·libc_getpgid_trampoline_addr(SB)/8, $libc_getpgid_trampoline<>(SB)
TEXT libc_getpgrp_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getpgrp(SB)
-
GLOBL ·libc_getpgrp_trampoline_addr(SB), RODATA, $8
DATA ·libc_getpgrp_trampoline_addr(SB)/8, $libc_getpgrp_trampoline<>(SB)
TEXT libc_getpid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getpid(SB)
-
GLOBL ·libc_getpid_trampoline_addr(SB), RODATA, $8
DATA ·libc_getpid_trampoline_addr(SB)/8, $libc_getpid_trampoline<>(SB)
TEXT libc_getppid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getppid(SB)
-
GLOBL ·libc_getppid_trampoline_addr(SB), RODATA, $8
DATA ·libc_getppid_trampoline_addr(SB)/8, $libc_getppid_trampoline<>(SB)
TEXT libc_getpriority_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getpriority(SB)
-
GLOBL ·libc_getpriority_trampoline_addr(SB), RODATA, $8
DATA ·libc_getpriority_trampoline_addr(SB)/8, $libc_getpriority_trampoline<>(SB)
TEXT libc_getrlimit_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getrlimit(SB)
-
GLOBL ·libc_getrlimit_trampoline_addr(SB), RODATA, $8
DATA ·libc_getrlimit_trampoline_addr(SB)/8, $libc_getrlimit_trampoline<>(SB)
TEXT libc_getrusage_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getrusage(SB)
-
GLOBL ·libc_getrusage_trampoline_addr(SB), RODATA, $8
DATA ·libc_getrusage_trampoline_addr(SB)/8, $libc_getrusage_trampoline<>(SB)
TEXT libc_getsid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getsid(SB)
-
GLOBL ·libc_getsid_trampoline_addr(SB), RODATA, $8
DATA ·libc_getsid_trampoline_addr(SB)/8, $libc_getsid_trampoline<>(SB)
TEXT libc_gettimeofday_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_gettimeofday(SB)
-
GLOBL ·libc_gettimeofday_trampoline_addr(SB), RODATA, $8
DATA ·libc_gettimeofday_trampoline_addr(SB)/8, $libc_gettimeofday_trampoline<>(SB)
TEXT libc_getuid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getuid(SB)
-
GLOBL ·libc_getuid_trampoline_addr(SB), RODATA, $8
DATA ·libc_getuid_trampoline_addr(SB)/8, $libc_getuid_trampoline<>(SB)
TEXT libc_issetugid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_issetugid(SB)
-
GLOBL ·libc_issetugid_trampoline_addr(SB), RODATA, $8
DATA ·libc_issetugid_trampoline_addr(SB)/8, $libc_issetugid_trampoline<>(SB)
TEXT libc_kqueue_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_kqueue(SB)
-
GLOBL ·libc_kqueue_trampoline_addr(SB), RODATA, $8
DATA ·libc_kqueue_trampoline_addr(SB)/8, $libc_kqueue_trampoline<>(SB)
TEXT libc_lchown_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_lchown(SB)
-
GLOBL ·libc_lchown_trampoline_addr(SB), RODATA, $8
DATA ·libc_lchown_trampoline_addr(SB)/8, $libc_lchown_trampoline<>(SB)
TEXT libc_link_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_link(SB)
-
GLOBL ·libc_link_trampoline_addr(SB), RODATA, $8
DATA ·libc_link_trampoline_addr(SB)/8, $libc_link_trampoline<>(SB)
TEXT libc_linkat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_linkat(SB)
-
GLOBL ·libc_linkat_trampoline_addr(SB), RODATA, $8
DATA ·libc_linkat_trampoline_addr(SB)/8, $libc_linkat_trampoline<>(SB)
TEXT libc_listen_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_listen(SB)
-
GLOBL ·libc_listen_trampoline_addr(SB), RODATA, $8
DATA ·libc_listen_trampoline_addr(SB)/8, $libc_listen_trampoline<>(SB)
TEXT libc_mkdir_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mkdir(SB)
-
GLOBL ·libc_mkdir_trampoline_addr(SB), RODATA, $8
DATA ·libc_mkdir_trampoline_addr(SB)/8, $libc_mkdir_trampoline<>(SB)
TEXT libc_mkdirat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mkdirat(SB)
-
GLOBL ·libc_mkdirat_trampoline_addr(SB), RODATA, $8
DATA ·libc_mkdirat_trampoline_addr(SB)/8, $libc_mkdirat_trampoline<>(SB)
TEXT libc_mkfifo_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mkfifo(SB)
-
GLOBL ·libc_mkfifo_trampoline_addr(SB), RODATA, $8
DATA ·libc_mkfifo_trampoline_addr(SB)/8, $libc_mkfifo_trampoline<>(SB)
TEXT libc_mknod_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mknod(SB)
-
GLOBL ·libc_mknod_trampoline_addr(SB), RODATA, $8
DATA ·libc_mknod_trampoline_addr(SB)/8, $libc_mknod_trampoline<>(SB)
TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mount(SB)
-
GLOBL ·libc_mount_trampoline_addr(SB), RODATA, $8
DATA ·libc_mount_trampoline_addr(SB)/8, $libc_mount_trampoline<>(SB)
TEXT libc_open_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_open(SB)
-
GLOBL ·libc_open_trampoline_addr(SB), RODATA, $8
DATA ·libc_open_trampoline_addr(SB)/8, $libc_open_trampoline<>(SB)
TEXT libc_openat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_openat(SB)
-
GLOBL ·libc_openat_trampoline_addr(SB), RODATA, $8
DATA ·libc_openat_trampoline_addr(SB)/8, $libc_openat_trampoline<>(SB)
TEXT libc_pathconf_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_pathconf(SB)
-
GLOBL ·libc_pathconf_trampoline_addr(SB), RODATA, $8
DATA ·libc_pathconf_trampoline_addr(SB)/8, $libc_pathconf_trampoline<>(SB)
TEXT libc_pread_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_pread(SB)
-
GLOBL ·libc_pread_trampoline_addr(SB), RODATA, $8
DATA ·libc_pread_trampoline_addr(SB)/8, $libc_pread_trampoline<>(SB)
TEXT libc_pwrite_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_pwrite(SB)
-
GLOBL ·libc_pwrite_trampoline_addr(SB), RODATA, $8
DATA ·libc_pwrite_trampoline_addr(SB)/8, $libc_pwrite_trampoline<>(SB)
TEXT libc_read_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_read(SB)
-
GLOBL ·libc_read_trampoline_addr(SB), RODATA, $8
DATA ·libc_read_trampoline_addr(SB)/8, $libc_read_trampoline<>(SB)
TEXT libc_readlink_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_readlink(SB)
-
GLOBL ·libc_readlink_trampoline_addr(SB), RODATA, $8
DATA ·libc_readlink_trampoline_addr(SB)/8, $libc_readlink_trampoline<>(SB)
TEXT libc_readlinkat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_readlinkat(SB)
-
GLOBL ·libc_readlinkat_trampoline_addr(SB), RODATA, $8
DATA ·libc_readlinkat_trampoline_addr(SB)/8, $libc_readlinkat_trampoline<>(SB)
TEXT libc_rename_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_rename(SB)
-
GLOBL ·libc_rename_trampoline_addr(SB), RODATA, $8
DATA ·libc_rename_trampoline_addr(SB)/8, $libc_rename_trampoline<>(SB)
TEXT libc_renameat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_renameat(SB)
-
GLOBL ·libc_renameat_trampoline_addr(SB), RODATA, $8
DATA ·libc_renameat_trampoline_addr(SB)/8, $libc_renameat_trampoline<>(SB)
TEXT libc_revoke_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_revoke(SB)
-
GLOBL ·libc_revoke_trampoline_addr(SB), RODATA, $8
DATA ·libc_revoke_trampoline_addr(SB)/8, $libc_revoke_trampoline<>(SB)
TEXT libc_rmdir_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_rmdir(SB)
-
GLOBL ·libc_rmdir_trampoline_addr(SB), RODATA, $8
DATA ·libc_rmdir_trampoline_addr(SB)/8, $libc_rmdir_trampoline<>(SB)
TEXT libc_lseek_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_lseek(SB)
-
GLOBL ·libc_lseek_trampoline_addr(SB), RODATA, $8
DATA ·libc_lseek_trampoline_addr(SB)/8, $libc_lseek_trampoline<>(SB)
TEXT libc_select_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_select(SB)
-
GLOBL ·libc_select_trampoline_addr(SB), RODATA, $8
DATA ·libc_select_trampoline_addr(SB)/8, $libc_select_trampoline<>(SB)
@@ -712,192 +595,160 @@ DATA ·libc_setattrlist_trampoline_addr(SB)/8, $libc_setattrlist_trampoline<>(SB
TEXT libc_setegid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setegid(SB)
-
GLOBL ·libc_setegid_trampoline_addr(SB), RODATA, $8
DATA ·libc_setegid_trampoline_addr(SB)/8, $libc_setegid_trampoline<>(SB)
TEXT libc_seteuid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_seteuid(SB)
-
GLOBL ·libc_seteuid_trampoline_addr(SB), RODATA, $8
DATA ·libc_seteuid_trampoline_addr(SB)/8, $libc_seteuid_trampoline<>(SB)
TEXT libc_setgid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setgid(SB)
-
GLOBL ·libc_setgid_trampoline_addr(SB), RODATA, $8
DATA ·libc_setgid_trampoline_addr(SB)/8, $libc_setgid_trampoline<>(SB)
TEXT libc_setlogin_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setlogin(SB)
-
GLOBL ·libc_setlogin_trampoline_addr(SB), RODATA, $8
DATA ·libc_setlogin_trampoline_addr(SB)/8, $libc_setlogin_trampoline<>(SB)
TEXT libc_setpgid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setpgid(SB)
-
GLOBL ·libc_setpgid_trampoline_addr(SB), RODATA, $8
DATA ·libc_setpgid_trampoline_addr(SB)/8, $libc_setpgid_trampoline<>(SB)
TEXT libc_setpriority_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setpriority(SB)
-
GLOBL ·libc_setpriority_trampoline_addr(SB), RODATA, $8
DATA ·libc_setpriority_trampoline_addr(SB)/8, $libc_setpriority_trampoline<>(SB)
TEXT libc_setprivexec_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setprivexec(SB)
-
GLOBL ·libc_setprivexec_trampoline_addr(SB), RODATA, $8
DATA ·libc_setprivexec_trampoline_addr(SB)/8, $libc_setprivexec_trampoline<>(SB)
TEXT libc_setregid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setregid(SB)
-
GLOBL ·libc_setregid_trampoline_addr(SB), RODATA, $8
DATA ·libc_setregid_trampoline_addr(SB)/8, $libc_setregid_trampoline<>(SB)
TEXT libc_setreuid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setreuid(SB)
-
GLOBL ·libc_setreuid_trampoline_addr(SB), RODATA, $8
DATA ·libc_setreuid_trampoline_addr(SB)/8, $libc_setreuid_trampoline<>(SB)
TEXT libc_setsid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setsid(SB)
-
GLOBL ·libc_setsid_trampoline_addr(SB), RODATA, $8
DATA ·libc_setsid_trampoline_addr(SB)/8, $libc_setsid_trampoline<>(SB)
TEXT libc_settimeofday_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_settimeofday(SB)
-
GLOBL ·libc_settimeofday_trampoline_addr(SB), RODATA, $8
DATA ·libc_settimeofday_trampoline_addr(SB)/8, $libc_settimeofday_trampoline<>(SB)
TEXT libc_setuid_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_setuid(SB)
-
GLOBL ·libc_setuid_trampoline_addr(SB), RODATA, $8
DATA ·libc_setuid_trampoline_addr(SB)/8, $libc_setuid_trampoline<>(SB)
TEXT libc_symlink_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_symlink(SB)
-
GLOBL ·libc_symlink_trampoline_addr(SB), RODATA, $8
DATA ·libc_symlink_trampoline_addr(SB)/8, $libc_symlink_trampoline<>(SB)
TEXT libc_symlinkat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_symlinkat(SB)
-
GLOBL ·libc_symlinkat_trampoline_addr(SB), RODATA, $8
DATA ·libc_symlinkat_trampoline_addr(SB)/8, $libc_symlinkat_trampoline<>(SB)
TEXT libc_sync_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_sync(SB)
-
GLOBL ·libc_sync_trampoline_addr(SB), RODATA, $8
DATA ·libc_sync_trampoline_addr(SB)/8, $libc_sync_trampoline<>(SB)
TEXT libc_truncate_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_truncate(SB)
-
GLOBL ·libc_truncate_trampoline_addr(SB), RODATA, $8
DATA ·libc_truncate_trampoline_addr(SB)/8, $libc_truncate_trampoline<>(SB)
TEXT libc_umask_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_umask(SB)
-
GLOBL ·libc_umask_trampoline_addr(SB), RODATA, $8
DATA ·libc_umask_trampoline_addr(SB)/8, $libc_umask_trampoline<>(SB)
TEXT libc_undelete_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_undelete(SB)
-
GLOBL ·libc_undelete_trampoline_addr(SB), RODATA, $8
DATA ·libc_undelete_trampoline_addr(SB)/8, $libc_undelete_trampoline<>(SB)
TEXT libc_unlink_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_unlink(SB)
-
GLOBL ·libc_unlink_trampoline_addr(SB), RODATA, $8
DATA ·libc_unlink_trampoline_addr(SB)/8, $libc_unlink_trampoline<>(SB)
TEXT libc_unlinkat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_unlinkat(SB)
-
GLOBL ·libc_unlinkat_trampoline_addr(SB), RODATA, $8
DATA ·libc_unlinkat_trampoline_addr(SB)/8, $libc_unlinkat_trampoline<>(SB)
TEXT libc_unmount_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_unmount(SB)
-
GLOBL ·libc_unmount_trampoline_addr(SB), RODATA, $8
DATA ·libc_unmount_trampoline_addr(SB)/8, $libc_unmount_trampoline<>(SB)
TEXT libc_write_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_write(SB)
-
GLOBL ·libc_write_trampoline_addr(SB), RODATA, $8
DATA ·libc_write_trampoline_addr(SB)/8, $libc_write_trampoline<>(SB)
TEXT libc_mmap_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_mmap(SB)
-
GLOBL ·libc_mmap_trampoline_addr(SB), RODATA, $8
DATA ·libc_mmap_trampoline_addr(SB)/8, $libc_mmap_trampoline<>(SB)
TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_munmap(SB)
-
GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $8
DATA ·libc_munmap_trampoline_addr(SB)/8, $libc_munmap_trampoline<>(SB)
TEXT libc_fstat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fstat(SB)
-
GLOBL ·libc_fstat_trampoline_addr(SB), RODATA, $8
DATA ·libc_fstat_trampoline_addr(SB)/8, $libc_fstat_trampoline<>(SB)
TEXT libc_fstatat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fstatat(SB)
-
GLOBL ·libc_fstatat_trampoline_addr(SB), RODATA, $8
DATA ·libc_fstatat_trampoline_addr(SB)/8, $libc_fstatat_trampoline<>(SB)
TEXT libc_fstatfs_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_fstatfs(SB)
-
GLOBL ·libc_fstatfs_trampoline_addr(SB), RODATA, $8
DATA ·libc_fstatfs_trampoline_addr(SB)/8, $libc_fstatfs_trampoline<>(SB)
TEXT libc_getfsstat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_getfsstat(SB)
-
GLOBL ·libc_getfsstat_trampoline_addr(SB), RODATA, $8
DATA ·libc_getfsstat_trampoline_addr(SB)/8, $libc_getfsstat_trampoline<>(SB)
TEXT libc_lstat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_lstat(SB)
-
GLOBL ·libc_lstat_trampoline_addr(SB), RODATA, $8
DATA ·libc_lstat_trampoline_addr(SB)/8, $libc_lstat_trampoline<>(SB)
TEXT libc_ptrace_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_ptrace(SB)
-
GLOBL ·libc_ptrace_trampoline_addr(SB), RODATA, $8
DATA ·libc_ptrace_trampoline_addr(SB)/8, $libc_ptrace_trampoline<>(SB)
TEXT libc_stat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_stat(SB)
-
GLOBL ·libc_stat_trampoline_addr(SB), RODATA, $8
DATA ·libc_stat_trampoline_addr(SB)/8, $libc_stat_trampoline<>(SB)
TEXT libc_statfs_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_statfs(SB)
-
GLOBL ·libc_statfs_trampoline_addr(SB), RODATA, $8
DATA ·libc_statfs_trampoline_addr(SB)/8, $libc_statfs_trampoline<>(SB)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_dragonfly_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_dragonfly_amd64.go
index 0eabac7ad..aad65fc79 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_dragonfly_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_dragonfly_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build dragonfly && amd64
-// +build dragonfly,amd64
package unix
@@ -1642,28 +1641,6 @@ func munmap(addr uintptr, length uintptr) (err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
func accept4(fd int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (nfd int, err error) {
r0, _, e1 := Syscall6(SYS_ACCEPT4, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0)
nfd = int(r0)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_386.go b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_386.go
index ee313eb00..c0096391a 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_386.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build freebsd && 386
-// +build freebsd,386
package unix
@@ -1862,28 +1861,6 @@ func munmap(addr uintptr, length uintptr) (err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
func accept4(fd int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (nfd int, err error) {
r0, _, e1 := Syscall6(SYS_ACCEPT4, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0)
nfd = int(r0)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_amd64.go
index 4c986e448..7664df749 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build freebsd && amd64
-// +build freebsd,amd64
package unix
@@ -1862,28 +1861,6 @@ func munmap(addr uintptr, length uintptr) (err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
func accept4(fd int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (nfd int, err error) {
r0, _, e1 := Syscall6(SYS_ACCEPT4, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0)
nfd = int(r0)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm.go b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm.go
index 555216944..ae099182c 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build freebsd && arm
-// +build freebsd,arm
package unix
@@ -1862,28 +1861,6 @@ func munmap(addr uintptr, length uintptr) (err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
func accept4(fd int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (nfd int, err error) {
r0, _, e1 := Syscall6(SYS_ACCEPT4, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0)
nfd = int(r0)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm64.go
index 67a226fbf..11fd5d45b 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build freebsd && arm64
-// +build freebsd,arm64
package unix
@@ -1862,28 +1861,6 @@ func munmap(addr uintptr, length uintptr) (err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
func accept4(fd int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (nfd int, err error) {
r0, _, e1 := Syscall6(SYS_ACCEPT4, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0)
nfd = int(r0)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_riscv64.go b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_riscv64.go
index f0b9ddaaa..c3d2d6530 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_riscv64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build freebsd && riscv64
-// +build freebsd,riscv64
package unix
@@ -1862,28 +1861,6 @@ func munmap(addr uintptr, length uintptr) (err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
func accept4(fd int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (nfd int, err error) {
r0, _, e1 := Syscall6(SYS_ACCEPT4, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0)
nfd = int(r0)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_illumos_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_illumos_amd64.go
index b57c7050d..c698cbc01 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_illumos_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_illumos_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build illumos && amd64
-// +build illumos,amd64
package unix
@@ -40,7 +39,7 @@ func readv(fd int, iovs []Iovec) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procreadv)), 3, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(iovs)), 0, 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -55,7 +54,7 @@ func preadv(fd int, iovs []Iovec, off int64) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procpreadv)), 4, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(iovs)), uintptr(off), 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -70,7 +69,7 @@ func writev(fd int, iovs []Iovec) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procwritev)), 3, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(iovs)), 0, 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -85,7 +84,7 @@ func pwritev(fd int, iovs []Iovec, off int64) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procpwritev)), 4, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(iovs)), uintptr(off), 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -96,7 +95,7 @@ func accept4(s int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (fd int,
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procaccept4)), 4, uintptr(s), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0)
fd = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux.go b/vendor/golang.org/x/sys/unix/zsyscall_linux.go
index a07321bed..87d8612a1 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux.go
@@ -1,7 +1,6 @@
// Code generated by mkmerge; DO NOT EDIT.
//go:build linux
-// +build linux
package unix
@@ -38,6 +37,21 @@ func fchmodat(dirfd int, path string, mode uint32) (err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+func fchmodat2(dirfd int, path string, mode uint32, flags int) (err error) {
+ var _p0 *byte
+ _p0, err = BytePtrFromString(path)
+ if err != nil {
+ return
+ }
+ _, _, e1 := Syscall6(SYS_FCHMODAT2, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func ioctl(fd int, req uint, arg uintptr) (err error) {
_, _, e1 := Syscall(SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(arg))
if e1 != 0 {
@@ -892,6 +906,16 @@ func Fspick(dirfd int, pathName string, flags int) (fd int, err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+func fsconfig(fd int, cmd uint, key *byte, value *byte, aux int) (err error) {
+ _, _, e1 := Syscall6(SYS_FSCONFIG, uintptr(fd), uintptr(cmd), uintptr(unsafe.Pointer(key)), uintptr(unsafe.Pointer(value)), uintptr(aux), 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func Getdents(fd int, buf []byte) (n int, err error) {
var _p0 unsafe.Pointer
if len(buf) > 0 {
@@ -1734,28 +1758,6 @@ func exitThread(code int) (err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, p *byte, np int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(p)), uintptr(np))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
-func writelen(fd int, p *byte, np int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(p)), uintptr(np))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
func readv(fd int, iovs []Iovec) (n int, err error) {
var _p0 unsafe.Pointer
if len(iovs) > 0 {
@@ -2197,3 +2199,33 @@ func getresgid(rgid *_C_int, egid *_C_int, sgid *_C_int) {
RawSyscallNoError(SYS_GETRESGID, uintptr(unsafe.Pointer(rgid)), uintptr(unsafe.Pointer(egid)), uintptr(unsafe.Pointer(sgid)))
return
}
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func schedSetattr(pid int, attr *SchedAttr, flags uint) (err error) {
+ _, _, e1 := Syscall(SYS_SCHED_SETATTR, uintptr(pid), uintptr(unsafe.Pointer(attr)), uintptr(flags))
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func schedGetattr(pid int, attr *SchedAttr, size uint, flags uint) (err error) {
+ _, _, e1 := Syscall6(SYS_SCHED_GETATTR, uintptr(pid), uintptr(unsafe.Pointer(attr)), uintptr(size), uintptr(flags), 0, 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func Cachestat(fd uint, crange *CachestatRange, cstat *Cachestat_t, flags uint) (err error) {
+ _, _, e1 := Syscall6(SYS_CACHESTAT, uintptr(fd), uintptr(unsafe.Pointer(crange)), uintptr(unsafe.Pointer(cstat)), uintptr(flags), 0, 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_386.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_386.go
index 07b549cc2..4def3e9fc 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_386.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build linux && 386
-// +build linux,386
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go
index 5f481bf83..fef2bc8ba 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build linux && amd64
-// +build linux,amd64
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm.go
index 824cd52c7..a9fd76a88 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build linux && arm
-// +build linux,arm
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go
index e77aecfe9..460065028 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build linux && arm64
-// +build linux,arm64
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_loong64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_loong64.go
index 806ffd1e1..c8987d264 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_loong64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_loong64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build linux && loong64
-// +build linux,loong64
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips.go
index 961a3afb7..921f43061 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build linux && mips
-// +build linux,mips
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64.go
index ed05005e9..44f067829 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build linux && mips64
-// +build linux,mips64
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64le.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64le.go
index d365b718f..e7fa0abf0 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64le.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64le.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build linux && mips64le
-// +build linux,mips64le
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_mipsle.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_mipsle.go
index c3f1b8bbd..8c5125675 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_mipsle.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_mipsle.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build linux && mipsle
-// +build linux,mipsle
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc.go
index a6574cf98..7392fd45e 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build linux && ppc
-// +build linux,ppc
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64.go
index f40990264..41180434e 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build linux && ppc64
-// +build linux,ppc64
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64le.go
index 9dfcc2997..40c6ce7ae 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64le.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64le.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build linux && ppc64le
-// +build linux,ppc64le
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_riscv64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_riscv64.go
index 0ab4f2ed7..2cfe34adb 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_riscv64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build linux && riscv64
-// +build linux,riscv64
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_s390x.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_s390x.go
index 6cde32237..61e6f0709 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_s390x.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_s390x.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build linux && s390x
-// +build linux,s390x
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_sparc64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_sparc64.go
index 5253d65bf..834b84204 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_sparc64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_sparc64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build linux && sparc64
-// +build linux,sparc64
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_netbsd_386.go b/vendor/golang.org/x/sys/unix/zsyscall_netbsd_386.go
index 35f499b32..e91ebc14a 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_netbsd_386.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_netbsd_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build netbsd && 386
-// +build netbsd,386
package unix
@@ -1824,28 +1823,6 @@ func munmap(addr uintptr, length uintptr) (err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error) {
var _p0 *byte
_p0, err = BytePtrFromString(path)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_netbsd_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_netbsd_amd64.go
index 3cda65b0d..be28babbc 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_netbsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_netbsd_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build netbsd && amd64
-// +build netbsd,amd64
package unix
@@ -1824,28 +1823,6 @@ func munmap(addr uintptr, length uintptr) (err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error) {
var _p0 *byte
_p0, err = BytePtrFromString(path)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_netbsd_arm.go b/vendor/golang.org/x/sys/unix/zsyscall_netbsd_arm.go
index 1e1fea902..fb587e826 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_netbsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_netbsd_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build netbsd && arm
-// +build netbsd,arm
package unix
@@ -1824,28 +1823,6 @@ func munmap(addr uintptr, length uintptr) (err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error) {
var _p0 *byte
_p0, err = BytePtrFromString(path)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_netbsd_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_netbsd_arm64.go
index 3b77da110..d576438bb 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_netbsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_netbsd_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build netbsd && arm64
-// +build netbsd,arm64
package unix
@@ -1824,28 +1823,6 @@ func munmap(addr uintptr, length uintptr) (err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error) {
var _p0 *byte
_p0, err = BytePtrFromString(path)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go
index 9ab9abf72..9dc42410b 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build openbsd && 386
-// +build openbsd,386
package unix
@@ -549,6 +548,12 @@ func ioctl(fd int, req uint, arg uintptr) (err error) {
return
}
+var libc_ioctl_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_ioctl ioctl "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
_, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg))
if e1 != 0 {
@@ -557,10 +562,6 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
return
}
-var libc_ioctl_trampoline_addr uintptr
-
-//go:cgo_import_dynamic libc_ioctl ioctl "libc.so"
-
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) {
@@ -583,6 +584,32 @@ var libc_sysctl_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+func fcntl(fd int, cmd int, arg int) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg))
+ n = int(r0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_fcntl_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_fcntl fcntl "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func fcntlPtr(fd int, cmd int, arg unsafe.Pointer) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg))
+ n = int(r0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) {
r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0)
n = int(r0)
@@ -2211,8 +2238,8 @@ var libc_munmap_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
+func getfsstat(stat *Statfs_t, bufsize uintptr, flags int) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_getfsstat_trampoline_addr, uintptr(unsafe.Pointer(stat)), uintptr(bufsize), uintptr(flags))
n = int(r0)
if e1 != 0 {
err = errnoErr(e1)
@@ -2220,16 +2247,9 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
return
}
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+var libc_getfsstat_trampoline_addr uintptr
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
+//go:cgo_import_dynamic libc_getfsstat getfsstat "libc.so"
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
@@ -2249,3 +2269,31 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error
var libc_utimensat_trampoline_addr uintptr
//go:cgo_import_dynamic libc_utimensat utimensat "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func pledge(promises *byte, execpromises *byte) (err error) {
+ _, _, e1 := syscall_syscall(libc_pledge_trampoline_addr, uintptr(unsafe.Pointer(promises)), uintptr(unsafe.Pointer(execpromises)), 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_pledge_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_pledge pledge "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func unveil(path *byte, flags *byte) (err error) {
+ _, _, e1 := syscall_syscall(libc_unveil_trampoline_addr, uintptr(unsafe.Pointer(path)), uintptr(unsafe.Pointer(flags)), 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_unveil_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_unveil unveil "libc.so"
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.s
index 3dcacd30d..41b561731 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.s
@@ -178,6 +178,11 @@ TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0
GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $4
DATA ·libc_sysctl_trampoline_addr(SB)/4, $libc_sysctl_trampoline<>(SB)
+TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_fcntl(SB)
+GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $4
+DATA ·libc_fcntl_trampoline_addr(SB)/4, $libc_fcntl_trampoline<>(SB)
+
TEXT libc_ppoll_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_ppoll(SB)
GLOBL ·libc_ppoll_trampoline_addr(SB), RODATA, $4
@@ -668,7 +673,22 @@ TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0
GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $4
DATA ·libc_munmap_trampoline_addr(SB)/4, $libc_munmap_trampoline<>(SB)
+TEXT libc_getfsstat_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_getfsstat(SB)
+GLOBL ·libc_getfsstat_trampoline_addr(SB), RODATA, $4
+DATA ·libc_getfsstat_trampoline_addr(SB)/4, $libc_getfsstat_trampoline<>(SB)
+
TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_utimensat(SB)
GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $4
DATA ·libc_utimensat_trampoline_addr(SB)/4, $libc_utimensat_trampoline<>(SB)
+
+TEXT libc_pledge_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_pledge(SB)
+GLOBL ·libc_pledge_trampoline_addr(SB), RODATA, $4
+DATA ·libc_pledge_trampoline_addr(SB)/4, $libc_pledge_trampoline<>(SB)
+
+TEXT libc_unveil_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_unveil(SB)
+GLOBL ·libc_unveil_trampoline_addr(SB), RODATA, $4
+DATA ·libc_unveil_trampoline_addr(SB)/4, $libc_unveil_trampoline<>(SB)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go
index 915761eab..0d3a0751c 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build openbsd && amd64
-// +build openbsd,amd64
package unix
@@ -585,6 +584,32 @@ var libc_sysctl_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+func fcntl(fd int, cmd int, arg int) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg))
+ n = int(r0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_fcntl_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_fcntl fcntl "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func fcntlPtr(fd int, cmd int, arg unsafe.Pointer) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg))
+ n = int(r0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) {
r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0)
n = int(r0)
@@ -2213,8 +2238,8 @@ var libc_munmap_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
+func getfsstat(stat *Statfs_t, bufsize uintptr, flags int) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_getfsstat_trampoline_addr, uintptr(unsafe.Pointer(stat)), uintptr(bufsize), uintptr(flags))
n = int(r0)
if e1 != 0 {
err = errnoErr(e1)
@@ -2222,16 +2247,9 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
return
}
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+var libc_getfsstat_trampoline_addr uintptr
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
+//go:cgo_import_dynamic libc_getfsstat getfsstat "libc.so"
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
@@ -2251,3 +2269,31 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error
var libc_utimensat_trampoline_addr uintptr
//go:cgo_import_dynamic libc_utimensat utimensat "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func pledge(promises *byte, execpromises *byte) (err error) {
+ _, _, e1 := syscall_syscall(libc_pledge_trampoline_addr, uintptr(unsafe.Pointer(promises)), uintptr(unsafe.Pointer(execpromises)), 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_pledge_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_pledge pledge "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func unveil(path *byte, flags *byte) (err error) {
+ _, _, e1 := syscall_syscall(libc_unveil_trampoline_addr, uintptr(unsafe.Pointer(path)), uintptr(unsafe.Pointer(flags)), 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_unveil_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_unveil unveil "libc.so"
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.s
index 2763620b0..4019a656f 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.s
@@ -178,6 +178,11 @@ TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0
GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8
DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB)
+TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_fcntl(SB)
+GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $8
+DATA ·libc_fcntl_trampoline_addr(SB)/8, $libc_fcntl_trampoline<>(SB)
+
TEXT libc_ppoll_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_ppoll(SB)
GLOBL ·libc_ppoll_trampoline_addr(SB), RODATA, $8
@@ -668,7 +673,22 @@ TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0
GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $8
DATA ·libc_munmap_trampoline_addr(SB)/8, $libc_munmap_trampoline<>(SB)
+TEXT libc_getfsstat_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_getfsstat(SB)
+GLOBL ·libc_getfsstat_trampoline_addr(SB), RODATA, $8
+DATA ·libc_getfsstat_trampoline_addr(SB)/8, $libc_getfsstat_trampoline<>(SB)
+
TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_utimensat(SB)
GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $8
DATA ·libc_utimensat_trampoline_addr(SB)/8, $libc_utimensat_trampoline<>(SB)
+
+TEXT libc_pledge_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_pledge(SB)
+GLOBL ·libc_pledge_trampoline_addr(SB), RODATA, $8
+DATA ·libc_pledge_trampoline_addr(SB)/8, $libc_pledge_trampoline<>(SB)
+
+TEXT libc_unveil_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_unveil(SB)
+GLOBL ·libc_unveil_trampoline_addr(SB), RODATA, $8
+DATA ·libc_unveil_trampoline_addr(SB)/8, $libc_unveil_trampoline<>(SB)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go
index 8e87fdf15..c39f7776d 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build openbsd && arm
-// +build openbsd,arm
package unix
@@ -549,6 +548,12 @@ func ioctl(fd int, req uint, arg uintptr) (err error) {
return
}
+var libc_ioctl_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_ioctl ioctl "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
_, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg))
if e1 != 0 {
@@ -557,10 +562,6 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
return
}
-var libc_ioctl_trampoline_addr uintptr
-
-//go:cgo_import_dynamic libc_ioctl ioctl "libc.so"
-
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) {
@@ -583,6 +584,32 @@ var libc_sysctl_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+func fcntl(fd int, cmd int, arg int) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg))
+ n = int(r0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_fcntl_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_fcntl fcntl "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func fcntlPtr(fd int, cmd int, arg unsafe.Pointer) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg))
+ n = int(r0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) {
r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0)
n = int(r0)
@@ -2211,8 +2238,8 @@ var libc_munmap_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
+func getfsstat(stat *Statfs_t, bufsize uintptr, flags int) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_getfsstat_trampoline_addr, uintptr(unsafe.Pointer(stat)), uintptr(bufsize), uintptr(flags))
n = int(r0)
if e1 != 0 {
err = errnoErr(e1)
@@ -2220,16 +2247,9 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
return
}
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+var libc_getfsstat_trampoline_addr uintptr
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
+//go:cgo_import_dynamic libc_getfsstat getfsstat "libc.so"
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
@@ -2249,3 +2269,31 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error
var libc_utimensat_trampoline_addr uintptr
//go:cgo_import_dynamic libc_utimensat utimensat "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func pledge(promises *byte, execpromises *byte) (err error) {
+ _, _, e1 := syscall_syscall(libc_pledge_trampoline_addr, uintptr(unsafe.Pointer(promises)), uintptr(unsafe.Pointer(execpromises)), 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_pledge_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_pledge pledge "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func unveil(path *byte, flags *byte) (err error) {
+ _, _, e1 := syscall_syscall(libc_unveil_trampoline_addr, uintptr(unsafe.Pointer(path)), uintptr(unsafe.Pointer(flags)), 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_unveil_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_unveil unveil "libc.so"
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.s
index c92231404..ac4af24f9 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.s
@@ -178,6 +178,11 @@ TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0
GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $4
DATA ·libc_sysctl_trampoline_addr(SB)/4, $libc_sysctl_trampoline<>(SB)
+TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_fcntl(SB)
+GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $4
+DATA ·libc_fcntl_trampoline_addr(SB)/4, $libc_fcntl_trampoline<>(SB)
+
TEXT libc_ppoll_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_ppoll(SB)
GLOBL ·libc_ppoll_trampoline_addr(SB), RODATA, $4
@@ -668,7 +673,22 @@ TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0
GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $4
DATA ·libc_munmap_trampoline_addr(SB)/4, $libc_munmap_trampoline<>(SB)
+TEXT libc_getfsstat_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_getfsstat(SB)
+GLOBL ·libc_getfsstat_trampoline_addr(SB), RODATA, $4
+DATA ·libc_getfsstat_trampoline_addr(SB)/4, $libc_getfsstat_trampoline<>(SB)
+
TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_utimensat(SB)
GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $4
DATA ·libc_utimensat_trampoline_addr(SB)/4, $libc_utimensat_trampoline<>(SB)
+
+TEXT libc_pledge_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_pledge(SB)
+GLOBL ·libc_pledge_trampoline_addr(SB), RODATA, $4
+DATA ·libc_pledge_trampoline_addr(SB)/4, $libc_pledge_trampoline<>(SB)
+
+TEXT libc_unveil_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_unveil(SB)
+GLOBL ·libc_unveil_trampoline_addr(SB), RODATA, $4
+DATA ·libc_unveil_trampoline_addr(SB)/4, $libc_unveil_trampoline<>(SB)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go
index 12a7a2160..57571d072 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build openbsd && arm64
-// +build openbsd,arm64
package unix
@@ -549,6 +548,12 @@ func ioctl(fd int, req uint, arg uintptr) (err error) {
return
}
+var libc_ioctl_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_ioctl ioctl "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
_, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg))
if e1 != 0 {
@@ -557,10 +562,6 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
return
}
-var libc_ioctl_trampoline_addr uintptr
-
-//go:cgo_import_dynamic libc_ioctl ioctl "libc.so"
-
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) {
@@ -583,6 +584,32 @@ var libc_sysctl_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+func fcntl(fd int, cmd int, arg int) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg))
+ n = int(r0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_fcntl_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_fcntl fcntl "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func fcntlPtr(fd int, cmd int, arg unsafe.Pointer) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg))
+ n = int(r0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) {
r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0)
n = int(r0)
@@ -2211,8 +2238,8 @@ var libc_munmap_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
+func getfsstat(stat *Statfs_t, bufsize uintptr, flags int) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_getfsstat_trampoline_addr, uintptr(unsafe.Pointer(stat)), uintptr(bufsize), uintptr(flags))
n = int(r0)
if e1 != 0 {
err = errnoErr(e1)
@@ -2220,16 +2247,9 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
return
}
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+var libc_getfsstat_trampoline_addr uintptr
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
+//go:cgo_import_dynamic libc_getfsstat getfsstat "libc.so"
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
@@ -2249,3 +2269,31 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error
var libc_utimensat_trampoline_addr uintptr
//go:cgo_import_dynamic libc_utimensat utimensat "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func pledge(promises *byte, execpromises *byte) (err error) {
+ _, _, e1 := syscall_syscall(libc_pledge_trampoline_addr, uintptr(unsafe.Pointer(promises)), uintptr(unsafe.Pointer(execpromises)), 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_pledge_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_pledge pledge "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func unveil(path *byte, flags *byte) (err error) {
+ _, _, e1 := syscall_syscall(libc_unveil_trampoline_addr, uintptr(unsafe.Pointer(path)), uintptr(unsafe.Pointer(flags)), 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_unveil_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_unveil unveil "libc.so"
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.s
index a6bc32c92..f77d53212 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.s
@@ -178,6 +178,11 @@ TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0
GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8
DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB)
+TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_fcntl(SB)
+GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $8
+DATA ·libc_fcntl_trampoline_addr(SB)/8, $libc_fcntl_trampoline<>(SB)
+
TEXT libc_ppoll_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_ppoll(SB)
GLOBL ·libc_ppoll_trampoline_addr(SB), RODATA, $8
@@ -668,7 +673,22 @@ TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0
GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $8
DATA ·libc_munmap_trampoline_addr(SB)/8, $libc_munmap_trampoline<>(SB)
+TEXT libc_getfsstat_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_getfsstat(SB)
+GLOBL ·libc_getfsstat_trampoline_addr(SB), RODATA, $8
+DATA ·libc_getfsstat_trampoline_addr(SB)/8, $libc_getfsstat_trampoline<>(SB)
+
TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_utimensat(SB)
GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $8
DATA ·libc_utimensat_trampoline_addr(SB)/8, $libc_utimensat_trampoline<>(SB)
+
+TEXT libc_pledge_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_pledge(SB)
+GLOBL ·libc_pledge_trampoline_addr(SB), RODATA, $8
+DATA ·libc_pledge_trampoline_addr(SB)/8, $libc_pledge_trampoline<>(SB)
+
+TEXT libc_unveil_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_unveil(SB)
+GLOBL ·libc_unveil_trampoline_addr(SB), RODATA, $8
+DATA ·libc_unveil_trampoline_addr(SB)/8, $libc_unveil_trampoline<>(SB)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.go
index b19e8aa03..e62963e67 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build openbsd && mips64
-// +build openbsd,mips64
package unix
@@ -549,6 +548,12 @@ func ioctl(fd int, req uint, arg uintptr) (err error) {
return
}
+var libc_ioctl_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_ioctl ioctl "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
_, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg))
if e1 != 0 {
@@ -557,10 +562,6 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
return
}
-var libc_ioctl_trampoline_addr uintptr
-
-//go:cgo_import_dynamic libc_ioctl ioctl "libc.so"
-
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) {
@@ -583,6 +584,32 @@ var libc_sysctl_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+func fcntl(fd int, cmd int, arg int) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg))
+ n = int(r0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_fcntl_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_fcntl fcntl "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func fcntlPtr(fd int, cmd int, arg unsafe.Pointer) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg))
+ n = int(r0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) {
r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0)
n = int(r0)
@@ -2211,8 +2238,8 @@ var libc_munmap_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
+func getfsstat(stat *Statfs_t, bufsize uintptr, flags int) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_getfsstat_trampoline_addr, uintptr(unsafe.Pointer(stat)), uintptr(bufsize), uintptr(flags))
n = int(r0)
if e1 != 0 {
err = errnoErr(e1)
@@ -2220,16 +2247,9 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
return
}
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+var libc_getfsstat_trampoline_addr uintptr
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
+//go:cgo_import_dynamic libc_getfsstat getfsstat "libc.so"
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
@@ -2249,3 +2269,31 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error
var libc_utimensat_trampoline_addr uintptr
//go:cgo_import_dynamic libc_utimensat utimensat "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func pledge(promises *byte, execpromises *byte) (err error) {
+ _, _, e1 := syscall_syscall(libc_pledge_trampoline_addr, uintptr(unsafe.Pointer(promises)), uintptr(unsafe.Pointer(execpromises)), 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_pledge_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_pledge pledge "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func unveil(path *byte, flags *byte) (err error) {
+ _, _, e1 := syscall_syscall(libc_unveil_trampoline_addr, uintptr(unsafe.Pointer(path)), uintptr(unsafe.Pointer(flags)), 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_unveil_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_unveil unveil "libc.so"
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.s
index b4e7bceab..fae140b62 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.s
@@ -178,6 +178,11 @@ TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0
GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8
DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB)
+TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_fcntl(SB)
+GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $8
+DATA ·libc_fcntl_trampoline_addr(SB)/8, $libc_fcntl_trampoline<>(SB)
+
TEXT libc_ppoll_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_ppoll(SB)
GLOBL ·libc_ppoll_trampoline_addr(SB), RODATA, $8
@@ -668,7 +673,22 @@ TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0
GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $8
DATA ·libc_munmap_trampoline_addr(SB)/8, $libc_munmap_trampoline<>(SB)
+TEXT libc_getfsstat_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_getfsstat(SB)
+GLOBL ·libc_getfsstat_trampoline_addr(SB), RODATA, $8
+DATA ·libc_getfsstat_trampoline_addr(SB)/8, $libc_getfsstat_trampoline<>(SB)
+
TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_utimensat(SB)
GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $8
DATA ·libc_utimensat_trampoline_addr(SB)/8, $libc_utimensat_trampoline<>(SB)
+
+TEXT libc_pledge_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_pledge(SB)
+GLOBL ·libc_pledge_trampoline_addr(SB), RODATA, $8
+DATA ·libc_pledge_trampoline_addr(SB)/8, $libc_pledge_trampoline<>(SB)
+
+TEXT libc_unveil_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_unveil(SB)
+GLOBL ·libc_unveil_trampoline_addr(SB), RODATA, $8
+DATA ·libc_unveil_trampoline_addr(SB)/8, $libc_unveil_trampoline<>(SB)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.go
index fb99594c9..00831354c 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build openbsd && ppc64
-// +build openbsd,ppc64
package unix
@@ -549,6 +548,12 @@ func ioctl(fd int, req uint, arg uintptr) (err error) {
return
}
+var libc_ioctl_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_ioctl ioctl "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
_, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg))
if e1 != 0 {
@@ -557,10 +562,6 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
return
}
-var libc_ioctl_trampoline_addr uintptr
-
-//go:cgo_import_dynamic libc_ioctl ioctl "libc.so"
-
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) {
@@ -583,6 +584,32 @@ var libc_sysctl_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+func fcntl(fd int, cmd int, arg int) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg))
+ n = int(r0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_fcntl_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_fcntl fcntl "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func fcntlPtr(fd int, cmd int, arg unsafe.Pointer) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg))
+ n = int(r0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) {
r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0)
n = int(r0)
@@ -2211,8 +2238,8 @@ var libc_munmap_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
+func getfsstat(stat *Statfs_t, bufsize uintptr, flags int) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_getfsstat_trampoline_addr, uintptr(unsafe.Pointer(stat)), uintptr(bufsize), uintptr(flags))
n = int(r0)
if e1 != 0 {
err = errnoErr(e1)
@@ -2220,16 +2247,9 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
return
}
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+var libc_getfsstat_trampoline_addr uintptr
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
+//go:cgo_import_dynamic libc_getfsstat getfsstat "libc.so"
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
@@ -2249,3 +2269,31 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error
var libc_utimensat_trampoline_addr uintptr
//go:cgo_import_dynamic libc_utimensat utimensat "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func pledge(promises *byte, execpromises *byte) (err error) {
+ _, _, e1 := syscall_syscall(libc_pledge_trampoline_addr, uintptr(unsafe.Pointer(promises)), uintptr(unsafe.Pointer(execpromises)), 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_pledge_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_pledge pledge "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func unveil(path *byte, flags *byte) (err error) {
+ _, _, e1 := syscall_syscall(libc_unveil_trampoline_addr, uintptr(unsafe.Pointer(path)), uintptr(unsafe.Pointer(flags)), 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_unveil_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_unveil unveil "libc.so"
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.s
index ca3f76600..9d1e0ff06 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.s
@@ -213,6 +213,12 @@ TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0
GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8
DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB)
+TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0
+ CALL libc_fcntl(SB)
+ RET
+GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $8
+DATA ·libc_fcntl_trampoline_addr(SB)/8, $libc_fcntl_trampoline<>(SB)
+
TEXT libc_ppoll_trampoline<>(SB),NOSPLIT,$0-0
CALL libc_ppoll(SB)
RET
@@ -801,8 +807,26 @@ TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0
GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $8
DATA ·libc_munmap_trampoline_addr(SB)/8, $libc_munmap_trampoline<>(SB)
+TEXT libc_getfsstat_trampoline<>(SB),NOSPLIT,$0-0
+ CALL libc_getfsstat(SB)
+ RET
+GLOBL ·libc_getfsstat_trampoline_addr(SB), RODATA, $8
+DATA ·libc_getfsstat_trampoline_addr(SB)/8, $libc_getfsstat_trampoline<>(SB)
+
TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0
CALL libc_utimensat(SB)
RET
GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $8
DATA ·libc_utimensat_trampoline_addr(SB)/8, $libc_utimensat_trampoline<>(SB)
+
+TEXT libc_pledge_trampoline<>(SB),NOSPLIT,$0-0
+ CALL libc_pledge(SB)
+ RET
+GLOBL ·libc_pledge_trampoline_addr(SB), RODATA, $8
+DATA ·libc_pledge_trampoline_addr(SB)/8, $libc_pledge_trampoline<>(SB)
+
+TEXT libc_unveil_trampoline<>(SB),NOSPLIT,$0-0
+ CALL libc_unveil(SB)
+ RET
+GLOBL ·libc_unveil_trampoline_addr(SB), RODATA, $8
+DATA ·libc_unveil_trampoline_addr(SB)/8, $libc_unveil_trampoline<>(SB)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.go
index 32cbbbc52..79029ed58 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build openbsd && riscv64
-// +build openbsd,riscv64
package unix
@@ -549,6 +548,12 @@ func ioctl(fd int, req uint, arg uintptr) (err error) {
return
}
+var libc_ioctl_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_ioctl ioctl "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
_, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg))
if e1 != 0 {
@@ -557,10 +562,6 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
return
}
-var libc_ioctl_trampoline_addr uintptr
-
-//go:cgo_import_dynamic libc_ioctl ioctl "libc.so"
-
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) {
@@ -583,6 +584,32 @@ var libc_sysctl_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+func fcntl(fd int, cmd int, arg int) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg))
+ n = int(r0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_fcntl_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_fcntl fcntl "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func fcntlPtr(fd int, cmd int, arg unsafe.Pointer) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg))
+ n = int(r0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) {
r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0)
n = int(r0)
@@ -2211,8 +2238,8 @@ var libc_munmap_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
+func getfsstat(stat *Statfs_t, bufsize uintptr, flags int) (n int, err error) {
+ r0, _, e1 := syscall_syscall(libc_getfsstat_trampoline_addr, uintptr(unsafe.Pointer(stat)), uintptr(bufsize), uintptr(flags))
n = int(r0)
if e1 != 0 {
err = errnoErr(e1)
@@ -2220,16 +2247,9 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
return
}
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+var libc_getfsstat_trampoline_addr uintptr
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
+//go:cgo_import_dynamic libc_getfsstat getfsstat "libc.so"
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
@@ -2249,3 +2269,31 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error
var libc_utimensat_trampoline_addr uintptr
//go:cgo_import_dynamic libc_utimensat utimensat "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func pledge(promises *byte, execpromises *byte) (err error) {
+ _, _, e1 := syscall_syscall(libc_pledge_trampoline_addr, uintptr(unsafe.Pointer(promises)), uintptr(unsafe.Pointer(execpromises)), 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_pledge_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_pledge pledge "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func unveil(path *byte, flags *byte) (err error) {
+ _, _, e1 := syscall_syscall(libc_unveil_trampoline_addr, uintptr(unsafe.Pointer(path)), uintptr(unsafe.Pointer(flags)), 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_unveil_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_unveil unveil "libc.so"
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.s
index 477a7d5b2..da115f9a4 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.s
@@ -178,6 +178,11 @@ TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0
GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8
DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB)
+TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_fcntl(SB)
+GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $8
+DATA ·libc_fcntl_trampoline_addr(SB)/8, $libc_fcntl_trampoline<>(SB)
+
TEXT libc_ppoll_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_ppoll(SB)
GLOBL ·libc_ppoll_trampoline_addr(SB), RODATA, $8
@@ -668,7 +673,22 @@ TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0
GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $8
DATA ·libc_munmap_trampoline_addr(SB)/8, $libc_munmap_trampoline<>(SB)
+TEXT libc_getfsstat_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_getfsstat(SB)
+GLOBL ·libc_getfsstat_trampoline_addr(SB), RODATA, $8
+DATA ·libc_getfsstat_trampoline_addr(SB)/8, $libc_getfsstat_trampoline<>(SB)
+
TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_utimensat(SB)
GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $8
DATA ·libc_utimensat_trampoline_addr(SB)/8, $libc_utimensat_trampoline<>(SB)
+
+TEXT libc_pledge_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_pledge(SB)
+GLOBL ·libc_pledge_trampoline_addr(SB), RODATA, $8
+DATA ·libc_pledge_trampoline_addr(SB)/8, $libc_pledge_trampoline<>(SB)
+
+TEXT libc_unveil_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_unveil(SB)
+GLOBL ·libc_unveil_trampoline_addr(SB), RODATA, $8
+DATA ·libc_unveil_trampoline_addr(SB)/8, $libc_unveil_trampoline<>(SB)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_solaris_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_solaris_amd64.go
index 609d1c598..829b87feb 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_solaris_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_solaris_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build solaris && amd64
-// +build solaris,amd64
package unix
@@ -436,7 +435,7 @@ func pipe(p *[2]_C_int) (n int, err error) {
r0, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procpipe)), 1, uintptr(unsafe.Pointer(p)), 0, 0, 0, 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -446,7 +445,7 @@ func pipe(p *[2]_C_int) (n int, err error) {
func pipe2(p *[2]_C_int, flags int) (err error) {
_, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procpipe2)), 2, uintptr(unsafe.Pointer(p)), uintptr(flags), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -456,7 +455,7 @@ func pipe2(p *[2]_C_int, flags int) (err error) {
func getsockname(fd int, rsa *RawSockaddrAny, addrlen *_Socklen) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procgetsockname)), 3, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -471,7 +470,7 @@ func Getcwd(buf []byte) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procGetcwd)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(len(buf)), 0, 0, 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -482,7 +481,7 @@ func getgroups(ngid int, gid *_Gid_t) (n int, err error) {
r0, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procgetgroups)), 2, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0, 0, 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -492,7 +491,7 @@ func getgroups(ngid int, gid *_Gid_t) (n int, err error) {
func setgroups(ngid int, gid *_Gid_t) (err error) {
_, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procsetgroups)), 2, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -503,7 +502,7 @@ func wait4(pid int32, statusp *_C_int, options int, rusage *Rusage) (wpid int32,
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procwait4)), 4, uintptr(pid), uintptr(unsafe.Pointer(statusp)), uintptr(options), uintptr(unsafe.Pointer(rusage)), 0, 0)
wpid = int32(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -518,7 +517,7 @@ func gethostname(buf []byte) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procgethostname)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(len(buf)), 0, 0, 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -533,7 +532,7 @@ func utimes(path string, times *[2]Timeval) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procutimes)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(times)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -548,7 +547,7 @@ func utimensat(fd int, path string, times *[2]Timespec, flag int) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procutimensat)), 4, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(times)), uintptr(flag), 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -559,7 +558,7 @@ func fcntl(fd int, cmd int, arg int) (val int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procfcntl)), 3, uintptr(fd), uintptr(cmd), uintptr(arg), 0, 0, 0)
val = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -569,7 +568,7 @@ func fcntl(fd int, cmd int, arg int) (val int, err error) {
func futimesat(fildes int, path *byte, times *[2]Timeval) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procfutimesat)), 3, uintptr(fildes), uintptr(unsafe.Pointer(path)), uintptr(unsafe.Pointer(times)), 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -580,7 +579,7 @@ func accept(s int, rsa *RawSockaddrAny, addrlen *_Socklen) (fd int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procaccept)), 3, uintptr(s), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), 0, 0, 0)
fd = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -591,7 +590,7 @@ func recvmsg(s int, msg *Msghdr, flags int) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proc__xnet_recvmsg)), 3, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags), 0, 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -602,7 +601,7 @@ func sendmsg(s int, msg *Msghdr, flags int) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proc__xnet_sendmsg)), 3, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags), 0, 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -612,7 +611,7 @@ func sendmsg(s int, msg *Msghdr, flags int) (n int, err error) {
func acct(path *byte) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procacct)), 1, uintptr(unsafe.Pointer(path)), 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -647,7 +646,7 @@ func ioctlRet(fd int, req int, arg uintptr) (ret int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procioctl)), 3, uintptr(fd), uintptr(req), uintptr(arg), 0, 0, 0)
ret = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -658,7 +657,7 @@ func ioctlPtrRet(fd int, req int, arg unsafe.Pointer) (ret int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procioctl)), 3, uintptr(fd), uintptr(req), uintptr(arg), 0, 0, 0)
ret = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -669,7 +668,7 @@ func poll(fds *PollFd, nfds int, timeout int) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procpoll)), 3, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(timeout), 0, 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -684,7 +683,7 @@ func Access(path string, mode uint32) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procAccess)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -694,7 +693,7 @@ func Access(path string, mode uint32) (err error) {
func Adjtime(delta *Timeval, olddelta *Timeval) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procAdjtime)), 2, uintptr(unsafe.Pointer(delta)), uintptr(unsafe.Pointer(olddelta)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -709,7 +708,7 @@ func Chdir(path string) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procChdir)), 1, uintptr(unsafe.Pointer(_p0)), 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -724,7 +723,7 @@ func Chmod(path string, mode uint32) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procChmod)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -739,7 +738,7 @@ func Chown(path string, uid int, gid int) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procChown)), 3, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid), 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -754,7 +753,7 @@ func Chroot(path string) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procChroot)), 1, uintptr(unsafe.Pointer(_p0)), 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -764,7 +763,7 @@ func Chroot(path string) (err error) {
func ClockGettime(clockid int32, time *Timespec) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procClockGettime)), 2, uintptr(clockid), uintptr(unsafe.Pointer(time)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -774,7 +773,7 @@ func ClockGettime(clockid int32, time *Timespec) (err error) {
func Close(fd int) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procClose)), 1, uintptr(fd), 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -790,7 +789,7 @@ func Creat(path string, mode uint32) (fd int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procCreat)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0, 0, 0, 0)
fd = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -801,7 +800,7 @@ func Dup(fd int) (nfd int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procDup)), 1, uintptr(fd), 0, 0, 0, 0, 0)
nfd = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -811,7 +810,7 @@ func Dup(fd int) (nfd int, err error) {
func Dup2(oldfd int, newfd int) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procDup2)), 2, uintptr(oldfd), uintptr(newfd), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -833,7 +832,7 @@ func Faccessat(dirfd int, path string, mode uint32, flags int) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFaccessat)), 4, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -843,7 +842,7 @@ func Faccessat(dirfd int, path string, mode uint32, flags int) (err error) {
func Fchdir(fd int) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFchdir)), 1, uintptr(fd), 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -853,7 +852,7 @@ func Fchdir(fd int) (err error) {
func Fchmod(fd int, mode uint32) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFchmod)), 2, uintptr(fd), uintptr(mode), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -868,7 +867,7 @@ func Fchmodat(dirfd int, path string, mode uint32, flags int) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFchmodat)), 4, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -878,7 +877,7 @@ func Fchmodat(dirfd int, path string, mode uint32, flags int) (err error) {
func Fchown(fd int, uid int, gid int) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFchown)), 3, uintptr(fd), uintptr(uid), uintptr(gid), 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -893,7 +892,7 @@ func Fchownat(dirfd int, path string, uid int, gid int, flags int) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFchownat)), 5, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid), uintptr(flags), 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -903,7 +902,7 @@ func Fchownat(dirfd int, path string, uid int, gid int, flags int) (err error) {
func Fdatasync(fd int) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFdatasync)), 1, uintptr(fd), 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -913,7 +912,7 @@ func Fdatasync(fd int) (err error) {
func Flock(fd int, how int) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFlock)), 2, uintptr(fd), uintptr(how), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -924,7 +923,7 @@ func Fpathconf(fd int, name int) (val int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFpathconf)), 2, uintptr(fd), uintptr(name), 0, 0, 0, 0)
val = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -934,7 +933,7 @@ func Fpathconf(fd int, name int) (val int, err error) {
func Fstat(fd int, stat *Stat_t) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFstat)), 2, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -949,7 +948,7 @@ func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFstatat)), 4, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), uintptr(flags), 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -959,7 +958,7 @@ func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) {
func Fstatvfs(fd int, vfsstat *Statvfs_t) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFstatvfs)), 2, uintptr(fd), uintptr(unsafe.Pointer(vfsstat)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -974,7 +973,7 @@ func Getdents(fd int, buf []byte, basep *uintptr) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procGetdents)), 4, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(buf)), uintptr(unsafe.Pointer(basep)), 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1001,7 +1000,7 @@ func Getpgid(pid int) (pgid int, err error) {
r0, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procGetpgid)), 1, uintptr(pid), 0, 0, 0, 0, 0)
pgid = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1012,7 +1011,7 @@ func Getpgrp() (pgid int, err error) {
r0, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procGetpgrp)), 0, 0, 0, 0, 0, 0, 0)
pgid = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1047,7 +1046,7 @@ func Getpriority(which int, who int) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procGetpriority)), 2, uintptr(which), uintptr(who), 0, 0, 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1057,7 +1056,7 @@ func Getpriority(which int, who int) (n int, err error) {
func Getrlimit(which int, lim *Rlimit) (err error) {
_, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procGetrlimit)), 2, uintptr(which), uintptr(unsafe.Pointer(lim)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1067,7 +1066,7 @@ func Getrlimit(which int, lim *Rlimit) (err error) {
func Getrusage(who int, rusage *Rusage) (err error) {
_, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procGetrusage)), 2, uintptr(who), uintptr(unsafe.Pointer(rusage)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1078,7 +1077,7 @@ func Getsid(pid int) (sid int, err error) {
r0, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procGetsid)), 1, uintptr(pid), 0, 0, 0, 0, 0)
sid = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1088,7 +1087,7 @@ func Getsid(pid int) (sid int, err error) {
func Gettimeofday(tv *Timeval) (err error) {
_, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procGettimeofday)), 1, uintptr(unsafe.Pointer(tv)), 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1106,7 +1105,7 @@ func Getuid() (uid int) {
func Kill(pid int, signum syscall.Signal) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procKill)), 2, uintptr(pid), uintptr(signum), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1121,7 +1120,7 @@ func Lchown(path string, uid int, gid int) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procLchown)), 3, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid), 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1141,7 +1140,7 @@ func Link(path string, link string) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procLink)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1151,7 +1150,7 @@ func Link(path string, link string) (err error) {
func Listen(s int, backlog int) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proc__xnet_llisten)), 2, uintptr(s), uintptr(backlog), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1166,7 +1165,7 @@ func Lstat(path string, stat *Stat_t) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procLstat)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1180,7 +1179,7 @@ func Madvise(b []byte, advice int) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMadvise)), 3, uintptr(unsafe.Pointer(_p0)), uintptr(len(b)), uintptr(advice), 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1195,7 +1194,7 @@ func Mkdir(path string, mode uint32) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMkdir)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1210,7 +1209,7 @@ func Mkdirat(dirfd int, path string, mode uint32) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMkdirat)), 3, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1225,7 +1224,7 @@ func Mkfifo(path string, mode uint32) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMkfifo)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1240,7 +1239,7 @@ func Mkfifoat(dirfd int, path string, mode uint32) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMkfifoat)), 3, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1255,7 +1254,7 @@ func Mknod(path string, mode uint32, dev int) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMknod)), 3, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1270,7 +1269,7 @@ func Mknodat(dirfd int, path string, mode uint32, dev int) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMknodat)), 4, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1284,7 +1283,7 @@ func Mlock(b []byte) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMlock)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(len(b)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1294,7 +1293,7 @@ func Mlock(b []byte) (err error) {
func Mlockall(flags int) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMlockall)), 1, uintptr(flags), 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1308,7 +1307,7 @@ func Mprotect(b []byte, prot int) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMprotect)), 3, uintptr(unsafe.Pointer(_p0)), uintptr(len(b)), uintptr(prot), 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1322,7 +1321,7 @@ func Msync(b []byte, flags int) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMsync)), 3, uintptr(unsafe.Pointer(_p0)), uintptr(len(b)), uintptr(flags), 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1336,7 +1335,7 @@ func Munlock(b []byte) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMunlock)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(len(b)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1346,7 +1345,7 @@ func Munlock(b []byte) (err error) {
func Munlockall() (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMunlockall)), 0, 0, 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1356,7 +1355,7 @@ func Munlockall() (err error) {
func Nanosleep(time *Timespec, leftover *Timespec) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procNanosleep)), 2, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1372,7 +1371,7 @@ func Open(path string, mode int, perm uint32) (fd int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procOpen)), 3, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm), 0, 0, 0)
fd = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1388,7 +1387,7 @@ func Openat(dirfd int, path string, flags int, mode uint32) (fd int, err error)
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procOpenat)), 4, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(flags), uintptr(mode), 0, 0)
fd = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1404,7 +1403,7 @@ func Pathconf(path string, name int) (val int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procPathconf)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(name), 0, 0, 0, 0)
val = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1414,7 +1413,7 @@ func Pathconf(path string, name int) (val int, err error) {
func Pause() (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procPause)), 0, 0, 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1429,7 +1428,7 @@ func pread(fd int, p []byte, offset int64) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procpread)), 4, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(p)), uintptr(offset), 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1444,7 +1443,7 @@ func pwrite(fd int, p []byte, offset int64) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procpwrite)), 4, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(p)), uintptr(offset), 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1459,7 +1458,7 @@ func read(fd int, p []byte) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procread)), 3, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(p)), 0, 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1479,7 +1478,7 @@ func Readlink(path string, buf []byte) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procReadlink)), 3, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(len(buf)), 0, 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1499,7 +1498,7 @@ func Rename(from string, to string) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procRename)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1519,7 +1518,7 @@ func Renameat(olddirfd int, oldpath string, newdirfd int, newpath string) (err e
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procRenameat)), 4, uintptr(olddirfd), uintptr(unsafe.Pointer(_p0)), uintptr(newdirfd), uintptr(unsafe.Pointer(_p1)), 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1534,7 +1533,7 @@ func Rmdir(path string) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procRmdir)), 1, uintptr(unsafe.Pointer(_p0)), 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1545,7 +1544,7 @@ func Seek(fd int, offset int64, whence int) (newoffset int64, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proclseek)), 3, uintptr(fd), uintptr(offset), uintptr(whence), 0, 0, 0)
newoffset = int64(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1556,7 +1555,7 @@ func Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procSelect)), 5, uintptr(nfd), uintptr(unsafe.Pointer(r)), uintptr(unsafe.Pointer(w)), uintptr(unsafe.Pointer(e)), uintptr(unsafe.Pointer(timeout)), 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1566,7 +1565,7 @@ func Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err
func Setegid(egid int) (err error) {
_, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procSetegid)), 1, uintptr(egid), 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1576,7 +1575,7 @@ func Setegid(egid int) (err error) {
func Seteuid(euid int) (err error) {
_, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procSeteuid)), 1, uintptr(euid), 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1586,7 +1585,7 @@ func Seteuid(euid int) (err error) {
func Setgid(gid int) (err error) {
_, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procSetgid)), 1, uintptr(gid), 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1600,7 +1599,7 @@ func Sethostname(p []byte) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procSethostname)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(len(p)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1610,7 +1609,7 @@ func Sethostname(p []byte) (err error) {
func Setpgid(pid int, pgid int) (err error) {
_, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procSetpgid)), 2, uintptr(pid), uintptr(pgid), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1620,7 +1619,7 @@ func Setpgid(pid int, pgid int) (err error) {
func Setpriority(which int, who int, prio int) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procSetpriority)), 3, uintptr(which), uintptr(who), uintptr(prio), 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1630,7 +1629,7 @@ func Setpriority(which int, who int, prio int) (err error) {
func Setregid(rgid int, egid int) (err error) {
_, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procSetregid)), 2, uintptr(rgid), uintptr(egid), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1640,7 +1639,7 @@ func Setregid(rgid int, egid int) (err error) {
func Setreuid(ruid int, euid int) (err error) {
_, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procSetreuid)), 2, uintptr(ruid), uintptr(euid), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1651,7 +1650,7 @@ func Setsid() (pid int, err error) {
r0, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procSetsid)), 0, 0, 0, 0, 0, 0, 0)
pid = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1661,7 +1660,7 @@ func Setsid() (pid int, err error) {
func Setuid(uid int) (err error) {
_, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procSetuid)), 1, uintptr(uid), 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1671,7 +1670,7 @@ func Setuid(uid int) (err error) {
func Shutdown(s int, how int) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procshutdown)), 2, uintptr(s), uintptr(how), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1686,7 +1685,7 @@ func Stat(path string, stat *Stat_t) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procStat)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1701,7 +1700,7 @@ func Statvfs(path string, vfsstat *Statvfs_t) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procStatvfs)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(vfsstat)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1721,7 +1720,7 @@ func Symlink(path string, link string) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procSymlink)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1731,7 +1730,7 @@ func Symlink(path string, link string) (err error) {
func Sync() (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procSync)), 0, 0, 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1742,7 +1741,7 @@ func Sysconf(which int) (n int64, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procSysconf)), 1, uintptr(which), 0, 0, 0, 0, 0)
n = int64(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1753,7 +1752,7 @@ func Times(tms *Tms) (ticks uintptr, err error) {
r0, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procTimes)), 1, uintptr(unsafe.Pointer(tms)), 0, 0, 0, 0, 0)
ticks = uintptr(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1768,7 +1767,7 @@ func Truncate(path string, length int64) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procTruncate)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(length), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1778,7 +1777,7 @@ func Truncate(path string, length int64) (err error) {
func Fsync(fd int) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFsync)), 1, uintptr(fd), 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1788,7 +1787,7 @@ func Fsync(fd int) (err error) {
func Ftruncate(fd int, length int64) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFtruncate)), 2, uintptr(fd), uintptr(length), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1806,7 +1805,7 @@ func Umask(mask int) (oldmask int) {
func Uname(buf *Utsname) (err error) {
_, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procUname)), 1, uintptr(unsafe.Pointer(buf)), 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1821,7 +1820,7 @@ func Unmount(target string, flags int) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procumount)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1836,7 +1835,7 @@ func Unlink(path string) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procUnlink)), 1, uintptr(unsafe.Pointer(_p0)), 0, 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1851,7 +1850,7 @@ func Unlinkat(dirfd int, path string, flags int) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procUnlinkat)), 3, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1861,7 +1860,7 @@ func Unlinkat(dirfd int, path string, flags int) (err error) {
func Ustat(dev int, ubuf *Ustat_t) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procUstat)), 2, uintptr(dev), uintptr(unsafe.Pointer(ubuf)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1876,7 +1875,7 @@ func Utime(path string, buf *Utimbuf) (err error) {
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procUtime)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(buf)), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1886,7 +1885,7 @@ func Utime(path string, buf *Utimbuf) (err error) {
func bind(s int, addr unsafe.Pointer, addrlen _Socklen) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proc__xnet_bind)), 3, uintptr(s), uintptr(addr), uintptr(addrlen), 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1896,7 +1895,7 @@ func bind(s int, addr unsafe.Pointer, addrlen _Socklen) (err error) {
func connect(s int, addr unsafe.Pointer, addrlen _Socklen) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proc__xnet_connect)), 3, uintptr(s), uintptr(addr), uintptr(addrlen), 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1907,7 +1906,7 @@ func mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) (
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procmmap)), 6, uintptr(addr), uintptr(length), uintptr(prot), uintptr(flag), uintptr(fd), uintptr(pos))
ret = uintptr(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1917,7 +1916,7 @@ func mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) (
func munmap(addr uintptr, length uintptr) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procmunmap)), 2, uintptr(addr), uintptr(length), 0, 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1928,7 +1927,7 @@ func sendfile(outfd int, infd int, offset *int64, count int) (written int, err e
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procsendfile)), 4, uintptr(outfd), uintptr(infd), uintptr(unsafe.Pointer(offset)), uintptr(count), 0, 0)
written = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1942,7 +1941,7 @@ func sendto(s int, buf []byte, flags int, to unsafe.Pointer, addrlen _Socklen) (
}
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proc__xnet_sendto)), 6, uintptr(s), uintptr(unsafe.Pointer(_p0)), uintptr(len(buf)), uintptr(flags), uintptr(to), uintptr(addrlen))
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1953,7 +1952,7 @@ func socket(domain int, typ int, proto int) (fd int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proc__xnet_socket)), 3, uintptr(domain), uintptr(typ), uintptr(proto), 0, 0, 0)
fd = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1963,7 +1962,7 @@ func socket(domain int, typ int, proto int) (fd int, err error) {
func socketpair(domain int, typ int, proto int, fd *[2]int32) (err error) {
_, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&proc__xnet_socketpair)), 4, uintptr(domain), uintptr(typ), uintptr(proto), uintptr(unsafe.Pointer(fd)), 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1978,7 +1977,7 @@ func write(fd int, p []byte) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procwrite)), 3, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(p)), 0, 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1988,7 +1987,7 @@ func write(fd int, p []byte) (n int, err error) {
func getsockopt(s int, level int, name int, val unsafe.Pointer, vallen *_Socklen) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proc__xnet_getsockopt)), 5, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(unsafe.Pointer(vallen)), 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -1998,7 +1997,7 @@ func getsockopt(s int, level int, name int, val unsafe.Pointer, vallen *_Socklen
func getpeername(fd int, rsa *RawSockaddrAny, addrlen *_Socklen) (err error) {
_, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procgetpeername)), 3, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), 0, 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -2008,7 +2007,7 @@ func getpeername(fd int, rsa *RawSockaddrAny, addrlen *_Socklen) (err error) {
func setsockopt(s int, level int, name int, val unsafe.Pointer, vallen uintptr) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procsetsockopt)), 5, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(vallen), 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -2023,7 +2022,7 @@ func recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Sockl
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procrecvfrom)), 6, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(p)), uintptr(flags), uintptr(unsafe.Pointer(from)), uintptr(unsafe.Pointer(fromlen)))
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -2034,7 +2033,7 @@ func port_create() (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procport_create)), 0, 0, 0, 0, 0, 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -2045,7 +2044,7 @@ func port_associate(port int, source int, object uintptr, events int, user *byte
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procport_associate)), 5, uintptr(port), uintptr(source), uintptr(object), uintptr(events), uintptr(unsafe.Pointer(user)), 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -2056,7 +2055,7 @@ func port_dissociate(port int, source int, object uintptr) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procport_dissociate)), 3, uintptr(port), uintptr(source), uintptr(object), 0, 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -2067,7 +2066,7 @@ func port_get(port int, pe *portEvent, timeout *Timespec) (n int, err error) {
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procport_get)), 3, uintptr(port), uintptr(unsafe.Pointer(pe)), uintptr(unsafe.Pointer(timeout)), 0, 0, 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -2078,7 +2077,7 @@ func port_getn(port int, pe *portEvent, max uint32, nget *uint32, timeout *Times
r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procport_getn)), 5, uintptr(port), uintptr(unsafe.Pointer(pe)), uintptr(max), uintptr(unsafe.Pointer(nget)), uintptr(unsafe.Pointer(timeout)), 0)
n = int(r0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -2088,7 +2087,7 @@ func port_getn(port int, pe *portEvent, max uint32, nget *uint32, timeout *Times
func putmsg(fd int, clptr *strbuf, dataptr *strbuf, flags int) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procputmsg)), 4, uintptr(fd), uintptr(unsafe.Pointer(clptr)), uintptr(unsafe.Pointer(dataptr)), uintptr(flags), 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
@@ -2098,7 +2097,7 @@ func putmsg(fd int, clptr *strbuf, dataptr *strbuf, flags int) (err error) {
func getmsg(fd int, clptr *strbuf, dataptr *strbuf, flags *int) (err error) {
_, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procgetmsg)), 4, uintptr(fd), uintptr(unsafe.Pointer(clptr)), uintptr(unsafe.Pointer(dataptr)), uintptr(unsafe.Pointer(flags)), 0, 0)
if e1 != 0 {
- err = e1
+ err = errnoErr(e1)
}
return
}
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_zos_s390x.go b/vendor/golang.org/x/sys/unix/zsyscall_zos_s390x.go
index c31681743..94f011238 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_zos_s390x.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_zos_s390x.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build zos && s390x
-// +build zos,s390x
package unix
@@ -40,17 +39,6 @@ func read(fd int, p []byte) (n int, err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
- r0, _, e1 := syscall_syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
- n = int(r0)
- if e1 != 0 {
- err = errnoErr(e1)
- }
- return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
func write(fd int, p []byte) (n int, err error) {
var _p0 unsafe.Pointer
if len(p) > 0 {
diff --git a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_386.go b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_386.go
index 55e048471..3a58ae819 100644
--- a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_386.go
+++ b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; DO NOT EDIT.
//go:build 386 && openbsd
-// +build 386,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_amd64.go b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_amd64.go
index d2243cf83..dcb7a0eb7 100644
--- a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; DO NOT EDIT.
//go:build amd64 && openbsd
-// +build amd64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_arm.go b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_arm.go
index 82dc51bd8..db5a7bf13 100644
--- a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; DO NOT EDIT.
//go:build arm && openbsd
-// +build arm,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_arm64.go b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_arm64.go
index cbdda1a4a..7be575a77 100644
--- a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; DO NOT EDIT.
//go:build arm64 && openbsd
-// +build arm64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_mips64.go b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_mips64.go
index f55eae1a8..d6e3174c6 100644
--- a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_mips64.go
+++ b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_mips64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; DO NOT EDIT.
//go:build mips64 && openbsd
-// +build mips64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_ppc64.go b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_ppc64.go
index e44054470..ee97157d0 100644
--- a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_ppc64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; DO NOT EDIT.
//go:build ppc64 && openbsd
-// +build ppc64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_riscv64.go b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_riscv64.go
index a0db82fce..35c3b91d0 100644
--- a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_riscv64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; DO NOT EDIT.
//go:build riscv64 && openbsd
-// +build riscv64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_darwin_amd64.go
index f8298ff9b..5edda7687 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_darwin_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_darwin_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && darwin
-// +build amd64,darwin
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_darwin_arm64.go
index 5eb433bbf..0dc9e8b4d 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_darwin_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_darwin_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm64 && darwin
-// +build arm64,darwin
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_dragonfly_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_dragonfly_amd64.go
index 703675c0c..308ddf3a1 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_dragonfly_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_dragonfly_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && dragonfly
-// +build amd64,dragonfly
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_386.go b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_386.go
index 4e0d96107..418664e3d 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_386.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build 386 && freebsd
-// +build 386,freebsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_amd64.go
index 01636b838..34d0b86d7 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && freebsd
-// +build amd64,freebsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm.go b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm.go
index ad99bc106..b71cf45e2 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm && freebsd
-// +build arm,freebsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm64.go
index 89dcc4274..e32df1c1e 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm64 && freebsd
-// +build arm64,freebsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_riscv64.go b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_riscv64.go
index ee37aaa0c..15ad6111f 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_riscv64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build riscv64 && freebsd
-// +build riscv64,freebsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go
index c9c4ad031..0cc3ce496 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build 386 && linux
-// +build 386,linux
package unix
@@ -447,4 +446,10 @@ const (
SYS_PROCESS_MRELEASE = 448
SYS_FUTEX_WAITV = 449
SYS_SET_MEMPOLICY_HOME_NODE = 450
+ SYS_CACHESTAT = 451
+ SYS_FCHMODAT2 = 452
+ SYS_MAP_SHADOW_STACK = 453
+ SYS_FUTEX_WAKE = 454
+ SYS_FUTEX_WAIT = 455
+ SYS_FUTEX_REQUEUE = 456
)
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
index 12ff3417c..856d92d69 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && linux
-// +build amd64,linux
package unix
@@ -369,4 +368,10 @@ const (
SYS_PROCESS_MRELEASE = 448
SYS_FUTEX_WAITV = 449
SYS_SET_MEMPOLICY_HOME_NODE = 450
+ SYS_CACHESTAT = 451
+ SYS_FCHMODAT2 = 452
+ SYS_MAP_SHADOW_STACK = 453
+ SYS_FUTEX_WAKE = 454
+ SYS_FUTEX_WAIT = 455
+ SYS_FUTEX_REQUEUE = 456
)
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go
index c3fb5e77a..8d467094c 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm && linux
-// +build arm,linux
package unix
@@ -411,4 +410,10 @@ const (
SYS_PROCESS_MRELEASE = 448
SYS_FUTEX_WAITV = 449
SYS_SET_MEMPOLICY_HOME_NODE = 450
+ SYS_CACHESTAT = 451
+ SYS_FCHMODAT2 = 452
+ SYS_MAP_SHADOW_STACK = 453
+ SYS_FUTEX_WAKE = 454
+ SYS_FUTEX_WAIT = 455
+ SYS_FUTEX_REQUEUE = 456
)
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
index 358c847a4..edc173244 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm64 && linux
-// +build arm64,linux
package unix
@@ -314,4 +313,10 @@ const (
SYS_PROCESS_MRELEASE = 448
SYS_FUTEX_WAITV = 449
SYS_SET_MEMPOLICY_HOME_NODE = 450
+ SYS_CACHESTAT = 451
+ SYS_FCHMODAT2 = 452
+ SYS_MAP_SHADOW_STACK = 453
+ SYS_FUTEX_WAKE = 454
+ SYS_FUTEX_WAIT = 455
+ SYS_FUTEX_REQUEUE = 456
)
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go
index 81c4849b1..445eba206 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build loong64 && linux
-// +build loong64,linux
package unix
@@ -308,4 +307,10 @@ const (
SYS_PROCESS_MRELEASE = 448
SYS_FUTEX_WAITV = 449
SYS_SET_MEMPOLICY_HOME_NODE = 450
+ SYS_CACHESTAT = 451
+ SYS_FCHMODAT2 = 452
+ SYS_MAP_SHADOW_STACK = 453
+ SYS_FUTEX_WAKE = 454
+ SYS_FUTEX_WAIT = 455
+ SYS_FUTEX_REQUEUE = 456
)
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go
index 202a57e90..adba01bca 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build mips && linux
-// +build mips,linux
package unix
@@ -431,4 +430,10 @@ const (
SYS_PROCESS_MRELEASE = 4448
SYS_FUTEX_WAITV = 4449
SYS_SET_MEMPOLICY_HOME_NODE = 4450
+ SYS_CACHESTAT = 4451
+ SYS_FCHMODAT2 = 4452
+ SYS_MAP_SHADOW_STACK = 4453
+ SYS_FUTEX_WAKE = 4454
+ SYS_FUTEX_WAIT = 4455
+ SYS_FUTEX_REQUEUE = 4456
)
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go
index 1fbceb52d..014c4e9c7 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build mips64 && linux
-// +build mips64,linux
package unix
@@ -361,4 +360,10 @@ const (
SYS_PROCESS_MRELEASE = 5448
SYS_FUTEX_WAITV = 5449
SYS_SET_MEMPOLICY_HOME_NODE = 5450
+ SYS_CACHESTAT = 5451
+ SYS_FCHMODAT2 = 5452
+ SYS_MAP_SHADOW_STACK = 5453
+ SYS_FUTEX_WAKE = 5454
+ SYS_FUTEX_WAIT = 5455
+ SYS_FUTEX_REQUEUE = 5456
)
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go
index b4ffb7a20..ccc97d74d 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build mips64le && linux
-// +build mips64le,linux
package unix
@@ -361,4 +360,10 @@ const (
SYS_PROCESS_MRELEASE = 5448
SYS_FUTEX_WAITV = 5449
SYS_SET_MEMPOLICY_HOME_NODE = 5450
+ SYS_CACHESTAT = 5451
+ SYS_FCHMODAT2 = 5452
+ SYS_MAP_SHADOW_STACK = 5453
+ SYS_FUTEX_WAKE = 5454
+ SYS_FUTEX_WAIT = 5455
+ SYS_FUTEX_REQUEUE = 5456
)
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go
index 867985f9b..ec2b64a95 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build mipsle && linux
-// +build mipsle,linux
package unix
@@ -431,4 +430,10 @@ const (
SYS_PROCESS_MRELEASE = 4448
SYS_FUTEX_WAITV = 4449
SYS_SET_MEMPOLICY_HOME_NODE = 4450
+ SYS_CACHESTAT = 4451
+ SYS_FCHMODAT2 = 4452
+ SYS_MAP_SHADOW_STACK = 4453
+ SYS_FUTEX_WAKE = 4454
+ SYS_FUTEX_WAIT = 4455
+ SYS_FUTEX_REQUEUE = 4456
)
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go
index a8cce69ed..21a839e33 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build ppc && linux
-// +build ppc,linux
package unix
@@ -438,4 +437,10 @@ const (
SYS_PROCESS_MRELEASE = 448
SYS_FUTEX_WAITV = 449
SYS_SET_MEMPOLICY_HOME_NODE = 450
+ SYS_CACHESTAT = 451
+ SYS_FCHMODAT2 = 452
+ SYS_MAP_SHADOW_STACK = 453
+ SYS_FUTEX_WAKE = 454
+ SYS_FUTEX_WAIT = 455
+ SYS_FUTEX_REQUEUE = 456
)
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go
index d44c5b39d..c11121ec3 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build ppc64 && linux
-// +build ppc64,linux
package unix
@@ -410,4 +409,10 @@ const (
SYS_PROCESS_MRELEASE = 448
SYS_FUTEX_WAITV = 449
SYS_SET_MEMPOLICY_HOME_NODE = 450
+ SYS_CACHESTAT = 451
+ SYS_FCHMODAT2 = 452
+ SYS_MAP_SHADOW_STACK = 453
+ SYS_FUTEX_WAKE = 454
+ SYS_FUTEX_WAIT = 455
+ SYS_FUTEX_REQUEUE = 456
)
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go
index 4214dd9c0..909b631fc 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build ppc64le && linux
-// +build ppc64le,linux
package unix
@@ -410,4 +409,10 @@ const (
SYS_PROCESS_MRELEASE = 448
SYS_FUTEX_WAITV = 449
SYS_SET_MEMPOLICY_HOME_NODE = 450
+ SYS_CACHESTAT = 451
+ SYS_FCHMODAT2 = 452
+ SYS_MAP_SHADOW_STACK = 453
+ SYS_FUTEX_WAKE = 454
+ SYS_FUTEX_WAIT = 455
+ SYS_FUTEX_REQUEUE = 456
)
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go
index ef285c567..e49bed16e 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build riscv64 && linux
-// +build riscv64,linux
package unix
@@ -315,4 +314,10 @@ const (
SYS_PROCESS_MRELEASE = 448
SYS_FUTEX_WAITV = 449
SYS_SET_MEMPOLICY_HOME_NODE = 450
+ SYS_CACHESTAT = 451
+ SYS_FCHMODAT2 = 452
+ SYS_MAP_SHADOW_STACK = 453
+ SYS_FUTEX_WAKE = 454
+ SYS_FUTEX_WAIT = 455
+ SYS_FUTEX_REQUEUE = 456
)
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go
index e6ed7d637..66017d2d3 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build s390x && linux
-// +build s390x,linux
package unix
@@ -376,4 +375,10 @@ const (
SYS_PROCESS_MRELEASE = 448
SYS_FUTEX_WAITV = 449
SYS_SET_MEMPOLICY_HOME_NODE = 450
+ SYS_CACHESTAT = 451
+ SYS_FCHMODAT2 = 452
+ SYS_MAP_SHADOW_STACK = 453
+ SYS_FUTEX_WAKE = 454
+ SYS_FUTEX_WAIT = 455
+ SYS_FUTEX_REQUEUE = 456
)
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go
index 92f628ef4..47bab18dc 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build sparc64 && linux
-// +build sparc64,linux
package unix
@@ -389,4 +388,10 @@ const (
SYS_PROCESS_MRELEASE = 448
SYS_FUTEX_WAITV = 449
SYS_SET_MEMPOLICY_HOME_NODE = 450
+ SYS_CACHESTAT = 451
+ SYS_FCHMODAT2 = 452
+ SYS_MAP_SHADOW_STACK = 453
+ SYS_FUTEX_WAKE = 454
+ SYS_FUTEX_WAIT = 455
+ SYS_FUTEX_REQUEUE = 456
)
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_netbsd_386.go b/vendor/golang.org/x/sys/unix/zsysnum_netbsd_386.go
index 3a6699eba..b2aa8cd49 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_netbsd_386.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_netbsd_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build 386 && netbsd
-// +build 386,netbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_netbsd_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_netbsd_amd64.go
index 5677cd4f1..524a1b1c9 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_netbsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_netbsd_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && netbsd
-// +build amd64,netbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_netbsd_arm.go b/vendor/golang.org/x/sys/unix/zsysnum_netbsd_arm.go
index e784cb6db..d59b943ac 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_netbsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_netbsd_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm && netbsd
-// +build arm,netbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_netbsd_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_netbsd_arm64.go
index bd4952efa..31e771d53 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_netbsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_netbsd_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; DO NOT EDIT.
//go:build arm64 && netbsd
-// +build arm64,netbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_386.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_386.go
index 597733813..9fd77c6cb 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_386.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build 386 && openbsd
-// +build 386,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_amd64.go
index 16af29189..af10af28c 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && openbsd
-// +build amd64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm.go
index f59b18a97..cc2028af4 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm && openbsd
-// +build arm,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm64.go
index 721ef5910..c06dd4415 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm64 && openbsd
-// +build arm64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_mips64.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_mips64.go
index 01c43a01f..9ddbf3e08 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_mips64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_mips64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build mips64 && openbsd
-// +build mips64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_ppc64.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_ppc64.go
index f258cfa24..19a6ee413 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_ppc64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build ppc64 && openbsd
-// +build ppc64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_riscv64.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_riscv64.go
index 07919e0ec..05192a782 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_riscv64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build riscv64 && openbsd
-// +build riscv64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_zos_s390x.go b/vendor/golang.org/x/sys/unix/zsysnum_zos_s390x.go
index 073daad43..b2e308581 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_zos_s390x.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_zos_s390x.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build zos && s390x
-// +build zos,s390x
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_aix_ppc.go b/vendor/golang.org/x/sys/unix/ztypes_aix_ppc.go
index 7a8161c1d..3e6d57cae 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_aix_ppc.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_aix_ppc.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build ppc && aix
-// +build ppc,aix
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_aix_ppc64.go b/vendor/golang.org/x/sys/unix/ztypes_aix_ppc64.go
index 07ed733c5..3a219bdce 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_aix_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_aix_ppc64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build ppc64 && aix
-// +build ppc64,aix
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go
index 690cefc3d..091d107f3 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && darwin
-// +build amd64,darwin
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go
index 5bffc10ea..28ff4ef74 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm64 && darwin
-// +build arm64,darwin
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_dragonfly_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_dragonfly_amd64.go
index d0ba8e9b8..30e405bb4 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_dragonfly_amd64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_dragonfly_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && dragonfly
-// +build amd64,dragonfly
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go
index 29dc48337..6cbd094a3 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build 386 && freebsd
-// +build 386,freebsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go
index 0a89b2890..7c03b6ee7 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && freebsd
-// +build amd64,freebsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go
index c8666bb15..422107ee8 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm && freebsd
-// +build arm,freebsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go
index 88fb48a88..505a12acf 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm64 && freebsd
-// +build arm64,freebsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go
index 698dc975e..cc986c790 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build riscv64 && freebsd
-// +build riscv64,freebsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux.go b/vendor/golang.org/x/sys/unix/ztypes_linux.go
index 26ef52aaf..eff6bcdef 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux.go
@@ -1,7 +1,6 @@
// Code generated by mkmerge; DO NOT EDIT.
//go:build linux
-// +build linux
package unix
@@ -175,7 +174,8 @@ type FscryptPolicyV2 struct {
Contents_encryption_mode uint8
Filenames_encryption_mode uint8
Flags uint8
- _ [4]uint8
+ Log2_data_unit_size uint8
+ _ [3]uint8
Master_key_identifier [16]uint8
}
@@ -456,60 +456,63 @@ type Ucred struct {
}
type TCPInfo struct {
- State uint8
- Ca_state uint8
- Retransmits uint8
- Probes uint8
- Backoff uint8
- Options uint8
- Rto uint32
- Ato uint32
- Snd_mss uint32
- Rcv_mss uint32
- Unacked uint32
- Sacked uint32
- Lost uint32
- Retrans uint32
- Fackets uint32
- Last_data_sent uint32
- Last_ack_sent uint32
- Last_data_recv uint32
- Last_ack_recv uint32
- Pmtu uint32
- Rcv_ssthresh uint32
- Rtt uint32
- Rttvar uint32
- Snd_ssthresh uint32
- Snd_cwnd uint32
- Advmss uint32
- Reordering uint32
- Rcv_rtt uint32
- Rcv_space uint32
- Total_retrans uint32
- Pacing_rate uint64
- Max_pacing_rate uint64
- Bytes_acked uint64
- Bytes_received uint64
- Segs_out uint32
- Segs_in uint32
- Notsent_bytes uint32
- Min_rtt uint32
- Data_segs_in uint32
- Data_segs_out uint32
- Delivery_rate uint64
- Busy_time uint64
- Rwnd_limited uint64
- Sndbuf_limited uint64
- Delivered uint32
- Delivered_ce uint32
- Bytes_sent uint64
- Bytes_retrans uint64
- Dsack_dups uint32
- Reord_seen uint32
- Rcv_ooopack uint32
- Snd_wnd uint32
- Rcv_wnd uint32
- Rehash uint32
+ State uint8
+ Ca_state uint8
+ Retransmits uint8
+ Probes uint8
+ Backoff uint8
+ Options uint8
+ Rto uint32
+ Ato uint32
+ Snd_mss uint32
+ Rcv_mss uint32
+ Unacked uint32
+ Sacked uint32
+ Lost uint32
+ Retrans uint32
+ Fackets uint32
+ Last_data_sent uint32
+ Last_ack_sent uint32
+ Last_data_recv uint32
+ Last_ack_recv uint32
+ Pmtu uint32
+ Rcv_ssthresh uint32
+ Rtt uint32
+ Rttvar uint32
+ Snd_ssthresh uint32
+ Snd_cwnd uint32
+ Advmss uint32
+ Reordering uint32
+ Rcv_rtt uint32
+ Rcv_space uint32
+ Total_retrans uint32
+ Pacing_rate uint64
+ Max_pacing_rate uint64
+ Bytes_acked uint64
+ Bytes_received uint64
+ Segs_out uint32
+ Segs_in uint32
+ Notsent_bytes uint32
+ Min_rtt uint32
+ Data_segs_in uint32
+ Data_segs_out uint32
+ Delivery_rate uint64
+ Busy_time uint64
+ Rwnd_limited uint64
+ Sndbuf_limited uint64
+ Delivered uint32
+ Delivered_ce uint32
+ Bytes_sent uint64
+ Bytes_retrans uint64
+ Dsack_dups uint32
+ Reord_seen uint32
+ Rcv_ooopack uint32
+ Snd_wnd uint32
+ Rcv_wnd uint32
+ Rehash uint32
+ Total_rto uint16
+ Total_rto_recoveries uint16
+ Total_rto_time uint32
}
type CanFilter struct {
@@ -552,7 +555,7 @@ const (
SizeofIPv6MTUInfo = 0x20
SizeofICMPv6Filter = 0x20
SizeofUcred = 0xc
- SizeofTCPInfo = 0xf0
+ SizeofTCPInfo = 0xf8
SizeofCanFilter = 0x8
SizeofTCPRepairOpt = 0x8
)
@@ -833,6 +836,15 @@ const (
FSPICK_EMPTY_PATH = 0x8
FSMOUNT_CLOEXEC = 0x1
+
+ FSCONFIG_SET_FLAG = 0x0
+ FSCONFIG_SET_STRING = 0x1
+ FSCONFIG_SET_BINARY = 0x2
+ FSCONFIG_SET_PATH = 0x3
+ FSCONFIG_SET_PATH_EMPTY = 0x4
+ FSCONFIG_SET_FD = 0x5
+ FSCONFIG_CMD_CREATE = 0x6
+ FSCONFIG_CMD_RECONFIGURE = 0x7
)
type OpenHow struct {
@@ -1547,6 +1559,7 @@ const (
IFLA_DEVLINK_PORT = 0x3e
IFLA_GSO_IPV4_MAX_SIZE = 0x3f
IFLA_GRO_IPV4_MAX_SIZE = 0x40
+ IFLA_DPLL_PIN = 0x41
IFLA_PROTO_DOWN_REASON_UNSPEC = 0x0
IFLA_PROTO_DOWN_REASON_MASK = 0x1
IFLA_PROTO_DOWN_REASON_VALUE = 0x2
@@ -1562,6 +1575,7 @@ const (
IFLA_INET6_ICMP6STATS = 0x6
IFLA_INET6_TOKEN = 0x7
IFLA_INET6_ADDR_GEN_MODE = 0x8
+ IFLA_INET6_RA_MTU = 0x9
IFLA_BR_UNSPEC = 0x0
IFLA_BR_FORWARD_DELAY = 0x1
IFLA_BR_HELLO_TIME = 0x2
@@ -1609,6 +1623,9 @@ const (
IFLA_BR_MCAST_MLD_VERSION = 0x2c
IFLA_BR_VLAN_STATS_PER_PORT = 0x2d
IFLA_BR_MULTI_BOOLOPT = 0x2e
+ IFLA_BR_MCAST_QUERIER_STATE = 0x2f
+ IFLA_BR_FDB_N_LEARNED = 0x30
+ IFLA_BR_FDB_MAX_LEARNED = 0x31
IFLA_BRPORT_UNSPEC = 0x0
IFLA_BRPORT_STATE = 0x1
IFLA_BRPORT_PRIORITY = 0x2
@@ -1646,6 +1663,14 @@ const (
IFLA_BRPORT_BACKUP_PORT = 0x22
IFLA_BRPORT_MRP_RING_OPEN = 0x23
IFLA_BRPORT_MRP_IN_OPEN = 0x24
+ IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT = 0x25
+ IFLA_BRPORT_MCAST_EHT_HOSTS_CNT = 0x26
+ IFLA_BRPORT_LOCKED = 0x27
+ IFLA_BRPORT_MAB = 0x28
+ IFLA_BRPORT_MCAST_N_GROUPS = 0x29
+ IFLA_BRPORT_MCAST_MAX_GROUPS = 0x2a
+ IFLA_BRPORT_NEIGH_VLAN_SUPPRESS = 0x2b
+ IFLA_BRPORT_BACKUP_NHID = 0x2c
IFLA_INFO_UNSPEC = 0x0
IFLA_INFO_KIND = 0x1
IFLA_INFO_DATA = 0x2
@@ -1667,6 +1692,9 @@ const (
IFLA_MACVLAN_MACADDR = 0x4
IFLA_MACVLAN_MACADDR_DATA = 0x5
IFLA_MACVLAN_MACADDR_COUNT = 0x6
+ IFLA_MACVLAN_BC_QUEUE_LEN = 0x7
+ IFLA_MACVLAN_BC_QUEUE_LEN_USED = 0x8
+ IFLA_MACVLAN_BC_CUTOFF = 0x9
IFLA_VRF_UNSPEC = 0x0
IFLA_VRF_TABLE = 0x1
IFLA_VRF_PORT_UNSPEC = 0x0
@@ -1690,9 +1718,22 @@ const (
IFLA_XFRM_UNSPEC = 0x0
IFLA_XFRM_LINK = 0x1
IFLA_XFRM_IF_ID = 0x2
+ IFLA_XFRM_COLLECT_METADATA = 0x3
IFLA_IPVLAN_UNSPEC = 0x0
IFLA_IPVLAN_MODE = 0x1
IFLA_IPVLAN_FLAGS = 0x2
+ NETKIT_NEXT = -0x1
+ NETKIT_PASS = 0x0
+ NETKIT_DROP = 0x2
+ NETKIT_REDIRECT = 0x7
+ NETKIT_L2 = 0x0
+ NETKIT_L3 = 0x1
+ IFLA_NETKIT_UNSPEC = 0x0
+ IFLA_NETKIT_PEER_INFO = 0x1
+ IFLA_NETKIT_PRIMARY = 0x2
+ IFLA_NETKIT_POLICY = 0x3
+ IFLA_NETKIT_PEER_POLICY = 0x4
+ IFLA_NETKIT_MODE = 0x5
IFLA_VXLAN_UNSPEC = 0x0
IFLA_VXLAN_ID = 0x1
IFLA_VXLAN_GROUP = 0x2
@@ -1723,6 +1764,8 @@ const (
IFLA_VXLAN_GPE = 0x1b
IFLA_VXLAN_TTL_INHERIT = 0x1c
IFLA_VXLAN_DF = 0x1d
+ IFLA_VXLAN_VNIFILTER = 0x1e
+ IFLA_VXLAN_LOCALBYPASS = 0x1f
IFLA_GENEVE_UNSPEC = 0x0
IFLA_GENEVE_ID = 0x1
IFLA_GENEVE_REMOTE = 0x2
@@ -1737,6 +1780,7 @@ const (
IFLA_GENEVE_LABEL = 0xb
IFLA_GENEVE_TTL_INHERIT = 0xc
IFLA_GENEVE_DF = 0xd
+ IFLA_GENEVE_INNER_PROTO_INHERIT = 0xe
IFLA_BAREUDP_UNSPEC = 0x0
IFLA_BAREUDP_PORT = 0x1
IFLA_BAREUDP_ETHERTYPE = 0x2
@@ -1749,6 +1793,8 @@ const (
IFLA_GTP_FD1 = 0x2
IFLA_GTP_PDP_HASHSIZE = 0x3
IFLA_GTP_ROLE = 0x4
+ IFLA_GTP_CREATE_SOCKETS = 0x5
+ IFLA_GTP_RESTART_COUNT = 0x6
IFLA_BOND_UNSPEC = 0x0
IFLA_BOND_MODE = 0x1
IFLA_BOND_ACTIVE_SLAVE = 0x2
@@ -1778,6 +1824,9 @@ const (
IFLA_BOND_AD_ACTOR_SYSTEM = 0x1a
IFLA_BOND_TLB_DYNAMIC_LB = 0x1b
IFLA_BOND_PEER_NOTIF_DELAY = 0x1c
+ IFLA_BOND_AD_LACP_ACTIVE = 0x1d
+ IFLA_BOND_MISSED_MAX = 0x1e
+ IFLA_BOND_NS_IP6_TARGET = 0x1f
IFLA_BOND_AD_INFO_UNSPEC = 0x0
IFLA_BOND_AD_INFO_AGGREGATOR = 0x1
IFLA_BOND_AD_INFO_NUM_PORTS = 0x2
@@ -1793,6 +1842,7 @@ const (
IFLA_BOND_SLAVE_AD_AGGREGATOR_ID = 0x6
IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE = 0x7
IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE = 0x8
+ IFLA_BOND_SLAVE_PRIO = 0x9
IFLA_VF_INFO_UNSPEC = 0x0
IFLA_VF_INFO = 0x1
IFLA_VF_UNSPEC = 0x0
@@ -1851,8 +1901,16 @@ const (
IFLA_STATS_LINK_XSTATS_SLAVE = 0x3
IFLA_STATS_LINK_OFFLOAD_XSTATS = 0x4
IFLA_STATS_AF_SPEC = 0x5
+ IFLA_STATS_GETSET_UNSPEC = 0x0
+ IFLA_STATS_GET_FILTERS = 0x1
+ IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS = 0x2
IFLA_OFFLOAD_XSTATS_UNSPEC = 0x0
IFLA_OFFLOAD_XSTATS_CPU_HIT = 0x1
+ IFLA_OFFLOAD_XSTATS_HW_S_INFO = 0x2
+ IFLA_OFFLOAD_XSTATS_L3_STATS = 0x3
+ IFLA_OFFLOAD_XSTATS_HW_S_INFO_UNSPEC = 0x0
+ IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST = 0x1
+ IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED = 0x2
IFLA_XDP_UNSPEC = 0x0
IFLA_XDP_FD = 0x1
IFLA_XDP_ATTACHED = 0x2
@@ -1882,6 +1940,11 @@ const (
IFLA_RMNET_UNSPEC = 0x0
IFLA_RMNET_MUX_ID = 0x1
IFLA_RMNET_FLAGS = 0x2
+ IFLA_MCTP_UNSPEC = 0x0
+ IFLA_MCTP_NET = 0x1
+ IFLA_DSA_UNSPEC = 0x0
+ IFLA_DSA_CONDUIT = 0x1
+ IFLA_DSA_MASTER = 0x1
)
const (
@@ -1977,7 +2040,7 @@ const (
NFT_MSG_GETFLOWTABLE = 0x17
NFT_MSG_DELFLOWTABLE = 0x18
NFT_MSG_GETRULE_RESET = 0x19
- NFT_MSG_MAX = 0x21
+ NFT_MSG_MAX = 0x22
NFTA_LIST_UNSPEC = 0x0
NFTA_LIST_ELEM = 0x1
NFTA_HOOK_UNSPEC = 0x0
@@ -2672,6 +2735,7 @@ const (
BPF_PROG_TYPE_LSM = 0x1d
BPF_PROG_TYPE_SK_LOOKUP = 0x1e
BPF_PROG_TYPE_SYSCALL = 0x1f
+ BPF_PROG_TYPE_NETFILTER = 0x20
BPF_CGROUP_INET_INGRESS = 0x0
BPF_CGROUP_INET_EGRESS = 0x1
BPF_CGROUP_INET_SOCK_CREATE = 0x2
@@ -2716,6 +2780,11 @@ const (
BPF_PERF_EVENT = 0x29
BPF_TRACE_KPROBE_MULTI = 0x2a
BPF_LSM_CGROUP = 0x2b
+ BPF_STRUCT_OPS = 0x2c
+ BPF_NETFILTER = 0x2d
+ BPF_TCX_INGRESS = 0x2e
+ BPF_TCX_EGRESS = 0x2f
+ BPF_TRACE_UPROBE_MULTI = 0x30
BPF_LINK_TYPE_UNSPEC = 0x0
BPF_LINK_TYPE_RAW_TRACEPOINT = 0x1
BPF_LINK_TYPE_TRACING = 0x2
@@ -2726,6 +2795,18 @@ const (
BPF_LINK_TYPE_PERF_EVENT = 0x7
BPF_LINK_TYPE_KPROBE_MULTI = 0x8
BPF_LINK_TYPE_STRUCT_OPS = 0x9
+ BPF_LINK_TYPE_NETFILTER = 0xa
+ BPF_LINK_TYPE_TCX = 0xb
+ BPF_LINK_TYPE_UPROBE_MULTI = 0xc
+ BPF_PERF_EVENT_UNSPEC = 0x0
+ BPF_PERF_EVENT_UPROBE = 0x1
+ BPF_PERF_EVENT_URETPROBE = 0x2
+ BPF_PERF_EVENT_KPROBE = 0x3
+ BPF_PERF_EVENT_KRETPROBE = 0x4
+ BPF_PERF_EVENT_TRACEPOINT = 0x5
+ BPF_PERF_EVENT_EVENT = 0x6
+ BPF_F_KPROBE_MULTI_RETURN = 0x1
+ BPF_F_UPROBE_MULTI_RETURN = 0x1
BPF_ANY = 0x0
BPF_NOEXIST = 0x1
BPF_EXIST = 0x2
@@ -2743,6 +2824,8 @@ const (
BPF_F_MMAPABLE = 0x400
BPF_F_PRESERVE_ELEMS = 0x800
BPF_F_INNER_MAP = 0x1000
+ BPF_F_LINK = 0x2000
+ BPF_F_PATH_FD = 0x4000
BPF_STATS_RUN_TIME = 0x0
BPF_STACK_BUILD_ID_EMPTY = 0x0
BPF_STACK_BUILD_ID_VALID = 0x1
@@ -2763,6 +2846,7 @@ const (
BPF_F_ZERO_CSUM_TX = 0x2
BPF_F_DONT_FRAGMENT = 0x4
BPF_F_SEQ_NUMBER = 0x8
+ BPF_F_NO_TUNNEL_KEY = 0x10
BPF_F_TUNINFO_FLAGS = 0x10
BPF_F_INDEX_MASK = 0xffffffff
BPF_F_CURRENT_CPU = 0xffffffff
@@ -2779,6 +2863,8 @@ const (
BPF_F_ADJ_ROOM_ENCAP_L4_UDP = 0x10
BPF_F_ADJ_ROOM_NO_CSUM_RESET = 0x20
BPF_F_ADJ_ROOM_ENCAP_L2_ETH = 0x40
+ BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = 0x80
+ BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = 0x100
BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff
BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 0x38
BPF_F_SYSCTL_BASE_NAME = 0x1
@@ -2867,6 +2953,8 @@ const (
BPF_DEVCG_DEV_CHAR = 0x2
BPF_FIB_LOOKUP_DIRECT = 0x1
BPF_FIB_LOOKUP_OUTPUT = 0x2
+ BPF_FIB_LOOKUP_SKIP_NEIGH = 0x4
+ BPF_FIB_LOOKUP_TBID = 0x8
BPF_FIB_LKUP_RET_SUCCESS = 0x0
BPF_FIB_LKUP_RET_BLACKHOLE = 0x1
BPF_FIB_LKUP_RET_UNREACHABLE = 0x2
@@ -2902,6 +2990,7 @@ const (
BPF_CORE_ENUMVAL_EXISTS = 0xa
BPF_CORE_ENUMVAL_VALUE = 0xb
BPF_CORE_TYPE_MATCHES = 0xc
+ BPF_F_TIMER_ABS = 0x1
)
const (
@@ -2980,6 +3069,12 @@ type LoopInfo64 struct {
Encrypt_key [32]uint8
Init [2]uint64
}
+type LoopConfig struct {
+ Fd uint32
+ Size uint32
+ Info LoopInfo64
+ _ [8]uint64
+}
type TIPCSocketAddr struct {
Ref uint32
@@ -3368,7 +3463,7 @@ const (
DEVLINK_PORT_FN_ATTR_STATE = 0x2
DEVLINK_PORT_FN_ATTR_OPSTATE = 0x3
DEVLINK_PORT_FN_ATTR_CAPS = 0x4
- DEVLINK_PORT_FUNCTION_ATTR_MAX = 0x4
+ DEVLINK_PORT_FUNCTION_ATTR_MAX = 0x5
)
type FsverityDigest struct {
@@ -4152,7 +4247,8 @@ const (
)
type LandlockRulesetAttr struct {
- Access_fs uint64
+ Access_fs uint64
+ Access_net uint64
}
type LandlockPathBeneathAttr struct {
@@ -4499,7 +4595,7 @@ const (
NL80211_ATTR_MAC_HINT = 0xc8
NL80211_ATTR_MAC_MASK = 0xd7
NL80211_ATTR_MAX_AP_ASSOC_STA = 0xca
- NL80211_ATTR_MAX = 0x145
+ NL80211_ATTR_MAX = 0x146
NL80211_ATTR_MAX_CRIT_PROT_DURATION = 0xb4
NL80211_ATTR_MAX_CSA_COUNTERS = 0xce
NL80211_ATTR_MAX_MATCH_SETS = 0x85
@@ -4869,7 +4965,7 @@ const (
NL80211_CMD_LEAVE_IBSS = 0x2c
NL80211_CMD_LEAVE_MESH = 0x45
NL80211_CMD_LEAVE_OCB = 0x6d
- NL80211_CMD_MAX = 0x99
+ NL80211_CMD_MAX = 0x9a
NL80211_CMD_MICHAEL_MIC_FAILURE = 0x29
NL80211_CMD_MODIFY_LINK_STA = 0x97
NL80211_CMD_NAN_MATCH = 0x78
@@ -5103,7 +5199,7 @@ const (
NL80211_FREQUENCY_ATTR_GO_CONCURRENT = 0xf
NL80211_FREQUENCY_ATTR_INDOOR_ONLY = 0xe
NL80211_FREQUENCY_ATTR_IR_CONCURRENT = 0xf
- NL80211_FREQUENCY_ATTR_MAX = 0x1b
+ NL80211_FREQUENCY_ATTR_MAX = 0x1c
NL80211_FREQUENCY_ATTR_MAX_TX_POWER = 0x6
NL80211_FREQUENCY_ATTR_NO_10MHZ = 0x11
NL80211_FREQUENCY_ATTR_NO_160MHZ = 0xc
@@ -5503,7 +5599,7 @@ const (
NL80211_RATE_INFO_HE_RU_ALLOC_52 = 0x1
NL80211_RATE_INFO_HE_RU_ALLOC_996 = 0x5
NL80211_RATE_INFO_HE_RU_ALLOC = 0x11
- NL80211_RATE_INFO_MAX = 0x16
+ NL80211_RATE_INFO_MAX = 0x1d
NL80211_RATE_INFO_MCS = 0x2
NL80211_RATE_INFO_SHORT_GI = 0x4
NL80211_RATE_INFO_VHT_MCS = 0x6
@@ -5516,7 +5612,7 @@ const (
NL80211_REGDOM_TYPE_CUSTOM_WORLD = 0x2
NL80211_REGDOM_TYPE_INTERSECTION = 0x3
NL80211_REGDOM_TYPE_WORLD = 0x1
- NL80211_REG_RULE_ATTR_MAX = 0x7
+ NL80211_REG_RULE_ATTR_MAX = 0x8
NL80211_REKEY_DATA_AKM = 0x4
NL80211_REKEY_DATA_KCK = 0x2
NL80211_REKEY_DATA_KEK = 0x1
@@ -5868,3 +5964,30 @@ const (
VIRTIO_NET_HDR_GSO_UDP_L4 = 0x5
VIRTIO_NET_HDR_GSO_ECN = 0x80
)
+
+type SchedAttr struct {
+ Size uint32
+ Policy uint32
+ Flags uint64
+ Nice int32
+ Priority uint32
+ Runtime uint64
+ Deadline uint64
+ Period uint64
+ Util_min uint32
+ Util_max uint32
+}
+
+const SizeofSchedAttr = 0x38
+
+type Cachestat_t struct {
+ Cache uint64
+ Dirty uint64
+ Writeback uint64
+ Evicted uint64
+ Recently_evicted uint64
+}
+type CachestatRange struct {
+ Off uint64
+ Len uint64
+}
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_386.go b/vendor/golang.org/x/sys/unix/ztypes_linux_386.go
index 6d8acbcc5..438a30aff 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_386.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build 386 && linux
-// +build 386,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go
index 59293c688..adceca355 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && linux
-// +build amd64,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go b/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go
index 40cfa38c2..eeaa00a37 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm && linux
-// +build arm,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go
index 055bc4216..6739aa91d 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm64 && linux
-// +build arm64,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_loong64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_loong64.go
index f28affbc6..9920ef631 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_loong64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_loong64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build loong64 && linux
-// +build loong64,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go
index 9d71e7ccd..2923b799a 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build mips && linux
-// +build mips,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go
index fd5ccd332..ce2750ee4 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build mips64 && linux
-// +build mips64,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go
index 7704de77a..3038811d7 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build mips64le && linux
-// +build mips64le,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go
index df00b8757..efc6fed18 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build mipsle && linux
-// +build mipsle,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go
index 0942840db..9a654b75a 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build ppc && linux
-// +build ppc,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go
index 034874395..40d358e33 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build ppc64 && linux
-// +build ppc64,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go
index bad067047..148c6ceb8 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build ppc64le && linux
-// +build ppc64le,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go
index 83c69c119..72ba81543 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build riscv64 && linux
-// +build riscv64,linux
package unix
@@ -733,6 +732,10 @@ const (
RISCV_HWPROBE_KEY_IMA_EXT_0 = 0x4
RISCV_HWPROBE_IMA_FD = 0x1
RISCV_HWPROBE_IMA_C = 0x2
+ RISCV_HWPROBE_IMA_V = 0x4
+ RISCV_HWPROBE_EXT_ZBA = 0x8
+ RISCV_HWPROBE_EXT_ZBB = 0x10
+ RISCV_HWPROBE_EXT_ZBS = 0x20
RISCV_HWPROBE_KEY_CPUPERF_0 = 0x5
RISCV_HWPROBE_MISALIGNED_UNKNOWN = 0x0
RISCV_HWPROBE_MISALIGNED_EMULATED = 0x1
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go b/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go
index aa268d025..71e765508 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build s390x && linux
-// +build s390x,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go
index 444045b6c..4abbdb9de 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build sparc64 && linux
-// +build sparc64,linux
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_netbsd_386.go b/vendor/golang.org/x/sys/unix/ztypes_netbsd_386.go
index 9bc4c8f9d..f22e7947d 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_netbsd_386.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_netbsd_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build 386 && netbsd
-// +build 386,netbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_netbsd_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_netbsd_amd64.go
index bb05f655d..066a7d83d 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_netbsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_netbsd_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && netbsd
-// +build amd64,netbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_netbsd_arm.go b/vendor/golang.org/x/sys/unix/ztypes_netbsd_arm.go
index db40e3a19..439548ec9 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_netbsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_netbsd_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm && netbsd
-// +build arm,netbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_netbsd_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_netbsd_arm64.go
index 11121151c..16085d3bb 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_netbsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_netbsd_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm64 && netbsd
-// +build arm64,netbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_386.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_386.go
index 26eba23b7..afd13a3af 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_386.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_386.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build 386 && openbsd
-// +build 386,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_amd64.go
index 5a5479886..5d97f1f9b 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && openbsd
-// +build amd64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm.go
index be58c4e1f..34871cdc1 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm && openbsd
-// +build arm,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm64.go
index 52338266c..5911bceb3 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build arm64 && openbsd
-// +build arm64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_mips64.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_mips64.go
index 605cfdb12..e4f24f3bc 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_mips64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_mips64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build mips64 && openbsd
-// +build mips64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_ppc64.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_ppc64.go
index d6724c010..ca50a7930 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_ppc64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build ppc64 && openbsd
-// +build ppc64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_riscv64.go
index ddfd27a43..d7d7f7902 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_riscv64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build riscv64 && openbsd
-// +build riscv64,openbsd
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_solaris_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_solaris_amd64.go
index 0400747c6..14160576d 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_solaris_amd64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_solaris_amd64.go
@@ -2,7 +2,6 @@
// Code generated by the command above; see README.md. DO NOT EDIT.
//go:build amd64 && solaris
-// +build amd64,solaris
package unix
diff --git a/vendor/golang.org/x/sys/unix/ztypes_zos_s390x.go b/vendor/golang.org/x/sys/unix/ztypes_zos_s390x.go
index aec1efcb3..54f31be63 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_zos_s390x.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_zos_s390x.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build zos && s390x
-// +build zos,s390x
// Hand edited based on ztypes_linux_s390x.go
// TODO: auto-generate.
diff --git a/vendor/golang.org/x/sys/windows/aliases.go b/vendor/golang.org/x/sys/windows/aliases.go
index a20ebea63..ce2d713d6 100644
--- a/vendor/golang.org/x/sys/windows/aliases.go
+++ b/vendor/golang.org/x/sys/windows/aliases.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build windows && go1.9
-// +build windows,go1.9
package windows
diff --git a/vendor/golang.org/x/sys/windows/empty.s b/vendor/golang.org/x/sys/windows/empty.s
index fdbbbcd31..ba64caca5 100644
--- a/vendor/golang.org/x/sys/windows/empty.s
+++ b/vendor/golang.org/x/sys/windows/empty.s
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build !go1.12
-// +build !go1.12
// This file is here to allow bodyless functions with go:linkname for Go 1.11
// and earlier (see https://golang.org/issue/23311).
diff --git a/vendor/golang.org/x/sys/windows/env_windows.go b/vendor/golang.org/x/sys/windows/env_windows.go
index b8ad19250..d4577a423 100644
--- a/vendor/golang.org/x/sys/windows/env_windows.go
+++ b/vendor/golang.org/x/sys/windows/env_windows.go
@@ -37,14 +37,17 @@ func (token Token) Environ(inheritExisting bool) (env []string, err error) {
return nil, err
}
defer DestroyEnvironmentBlock(block)
- blockp := unsafe.Pointer(block)
- for {
- entry := UTF16PtrToString((*uint16)(blockp))
- if len(entry) == 0 {
- break
+ size := unsafe.Sizeof(*block)
+ for *block != 0 {
+ // find NUL terminator
+ end := unsafe.Pointer(block)
+ for *(*uint16)(end) != 0 {
+ end = unsafe.Add(end, size)
}
- env = append(env, entry)
- blockp = unsafe.Add(blockp, 2*(len(entry)+1))
+
+ entry := unsafe.Slice(block, (uintptr(end)-uintptr(unsafe.Pointer(block)))/size)
+ env = append(env, UTF16ToString(entry))
+ block = (*uint16)(unsafe.Add(end, size))
}
return env, nil
}
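
For context on the rewritten Environ loop above (it now walks the raw UTF-16 block with unsafe.Add/unsafe.Slice instead of re-measuring each entry), here is a minimal, hypothetical caller; it assumes only the exported Token.Environ and OpenCurrentProcessToken APIs already present in x/sys/windows.

//go:build windows

package main

import (
	"fmt"

	"golang.org/x/sys/windows"
)

func main() {
	// Hypothetical caller: list the environment block for the current
	// process token, exercising the rewritten Environ loop.
	token, err := windows.OpenCurrentProcessToken()
	if err != nil {
		panic(err)
	}
	defer token.Close()

	env, err := token.Environ(true) // true: merge in the existing process environment
	if err != nil {
		panic(err)
	}
	for _, kv := range env {
		fmt.Println(kv) // each entry is "KEY=value"
	}
}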
diff --git a/vendor/golang.org/x/sys/windows/eventlog.go b/vendor/golang.org/x/sys/windows/eventlog.go
index 2cd60645e..6c366955d 100644
--- a/vendor/golang.org/x/sys/windows/eventlog.go
+++ b/vendor/golang.org/x/sys/windows/eventlog.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build windows
-// +build windows
package windows
diff --git a/vendor/golang.org/x/sys/windows/exec_windows.go b/vendor/golang.org/x/sys/windows/exec_windows.go
index a52e0331d..9cabbb694 100644
--- a/vendor/golang.org/x/sys/windows/exec_windows.go
+++ b/vendor/golang.org/x/sys/windows/exec_windows.go
@@ -22,7 +22,7 @@ import (
// but only if there is space or tab inside s.
func EscapeArg(s string) string {
if len(s) == 0 {
- return "\"\""
+ return `""`
}
n := len(s)
hasSpace := false
@@ -35,7 +35,7 @@ func EscapeArg(s string) string {
}
}
if hasSpace {
- n += 2
+ n += 2 // Reserve space for quotes.
}
if n == len(s) {
return s
@@ -82,20 +82,68 @@ func EscapeArg(s string) string {
// in CreateProcess's CommandLine argument, CreateService/ChangeServiceConfig's BinaryPathName argument,
// or any program that uses CommandLineToArgv.
func ComposeCommandLine(args []string) string {
- var commandLine string
- for i := range args {
- if i > 0 {
- commandLine += " "
- }
- commandLine += EscapeArg(args[i])
+ if len(args) == 0 {
+ return ""
}
- return commandLine
+
+ // Per https://learn.microsoft.com/en-us/windows/win32/api/shellapi/nf-shellapi-commandlinetoargvw:
+ // “This function accepts command lines that contain a program name; the
+ // program name can be enclosed in quotation marks or not.”
+ //
+ // Unfortunately, it provides no means of escaping interior quotation marks
+ // within that program name, and we have no way to report them here.
+ prog := args[0]
+ mustQuote := len(prog) == 0
+ for i := 0; i < len(prog); i++ {
+ c := prog[i]
+ if c <= ' ' || (c == '"' && i == 0) {
+ // Force quotes for not only the ASCII space and tab as described in the
+ // MSDN article, but also ASCII control characters.
+ // The documentation for CommandLineToArgvW doesn't say what happens when
+ // the first argument is not a valid program name, but it empirically
+ // seems to drop unquoted control characters.
+ mustQuote = true
+ break
+ }
+ }
+ var commandLine []byte
+ if mustQuote {
+ commandLine = make([]byte, 0, len(prog)+2)
+ commandLine = append(commandLine, '"')
+ for i := 0; i < len(prog); i++ {
+ c := prog[i]
+ if c == '"' {
+ // This quote would interfere with our surrounding quotes.
+ // We have no way to report an error, so just strip out
+ // the offending character instead.
+ continue
+ }
+ commandLine = append(commandLine, c)
+ }
+ commandLine = append(commandLine, '"')
+ } else {
+ if len(args) == 1 {
+ // args[0] is a valid command line representing itself.
+ // No need to allocate a new slice or string for it.
+ return prog
+ }
+ commandLine = []byte(prog)
+ }
+
+ for _, arg := range args[1:] {
+ commandLine = append(commandLine, ' ')
+ // TODO(bcmills): since we're already appending to a slice, it would be nice
+ // to avoid the intermediate allocations of EscapeArg.
+ // Perhaps we can factor out an appendEscapedArg function.
+ commandLine = append(commandLine, EscapeArg(arg)...)
+ }
+ return string(commandLine)
}
// DecomposeCommandLine breaks apart its argument command line into unescaped parts using CommandLineToArgv,
// as gathered from GetCommandLine, QUERY_SERVICE_CONFIG's BinaryPathName argument, or elsewhere that
// command lines are passed around.
-// DecomposeCommandLine returns error if commandLine contains NUL.
+// DecomposeCommandLine returns an error if commandLine contains NUL.
func DecomposeCommandLine(commandLine string) ([]string, error) {
if len(commandLine) == 0 {
return []string{}, nil
@@ -105,18 +153,35 @@ func DecomposeCommandLine(commandLine string) ([]string, error) {
return nil, errorspkg.New("string with NUL passed to DecomposeCommandLine")
}
var argc int32
- argv, err := CommandLineToArgv(&utf16CommandLine[0], &argc)
+ argv, err := commandLineToArgv(&utf16CommandLine[0], &argc)
if err != nil {
return nil, err
}
defer LocalFree(Handle(unsafe.Pointer(argv)))
+
var args []string
- for _, v := range (*argv)[:argc] {
- args = append(args, UTF16ToString((*v)[:]))
+ for _, p := range unsafe.Slice(argv, argc) {
+ args = append(args, UTF16PtrToString(p))
}
return args, nil
}
+// CommandLineToArgv parses a Unicode command line string and sets
+// argc to the number of parsed arguments.
+//
+// The returned memory should be freed using a single call to LocalFree.
+//
+// Note that although the return type of CommandLineToArgv indicates 8192
+// entries of up to 8192 characters each, the actual count of parsed arguments
+// may exceed 8192, and the documentation for CommandLineToArgvW does not mention
+// any bound on the lengths of the individual argument strings.
+// (See https://go.dev/issue/63236.)
+func CommandLineToArgv(cmd *uint16, argc *int32) (argv *[8192]*[8192]uint16, err error) {
+ argp, err := commandLineToArgv(cmd, argc)
+ argv = (*[8192]*[8192]uint16)(unsafe.Pointer(argp))
+ return argv, err
+}
+
func CloseOnExec(fd Handle) {
SetHandleInformation(Handle(fd), HANDLE_FLAG_INHERIT, 0)
}
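
The new ComposeCommandLine path quotes the program name separately from the remaining arguments, and DecomposeCommandLine now goes through the internal commandLineToArgv. A small round-trip sketch (illustrative arguments only) is:

//go:build windows

package main

import (
	"fmt"

	"golang.org/x/sys/windows"
)

func main() {
	// Illustrative arguments only; the program name contains spaces and one
	// argument contains embedded quotes, so both quoting paths above are hit.
	args := []string{`C:\Program Files\App\app.exe`, "--name", `value with "quotes"`}

	cmdline := windows.ComposeCommandLine(args)
	fmt.Println(cmdline)

	parsed, err := windows.DecomposeCommandLine(cmdline)
	if err != nil {
		panic(err)
	}
	for i, a := range parsed {
		fmt.Printf("argv[%d] = %q\n", i, a)
	}
}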
diff --git a/vendor/golang.org/x/sys/windows/mksyscall.go b/vendor/golang.org/x/sys/windows/mksyscall.go
index 8563f79c5..dbcdb090c 100644
--- a/vendor/golang.org/x/sys/windows/mksyscall.go
+++ b/vendor/golang.org/x/sys/windows/mksyscall.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build generate
-// +build generate
package windows
diff --git a/vendor/golang.org/x/sys/windows/race.go b/vendor/golang.org/x/sys/windows/race.go
index 9196b089c..0f1bdc386 100644
--- a/vendor/golang.org/x/sys/windows/race.go
+++ b/vendor/golang.org/x/sys/windows/race.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build windows && race
-// +build windows,race
package windows
diff --git a/vendor/golang.org/x/sys/windows/race0.go b/vendor/golang.org/x/sys/windows/race0.go
index 7bae4817a..0c78da78b 100644
--- a/vendor/golang.org/x/sys/windows/race0.go
+++ b/vendor/golang.org/x/sys/windows/race0.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build windows && !race
-// +build windows,!race
package windows
diff --git a/vendor/golang.org/x/sys/windows/security_windows.go b/vendor/golang.org/x/sys/windows/security_windows.go
index d414ef13b..26be94a8a 100644
--- a/vendor/golang.org/x/sys/windows/security_windows.go
+++ b/vendor/golang.org/x/sys/windows/security_windows.go
@@ -7,8 +7,6 @@ package windows
import (
"syscall"
"unsafe"
-
- "golang.org/x/sys/internal/unsafeheader"
)
const (
@@ -1341,21 +1339,14 @@ func (selfRelativeSD *SECURITY_DESCRIPTOR) copySelfRelativeSecurityDescriptor()
sdLen = min
}
- var src []byte
- h := (*unsafeheader.Slice)(unsafe.Pointer(&src))
- h.Data = unsafe.Pointer(selfRelativeSD)
- h.Len = sdLen
- h.Cap = sdLen
-
+ src := unsafe.Slice((*byte)(unsafe.Pointer(selfRelativeSD)), sdLen)
+ // SECURITY_DESCRIPTOR has pointers in it, which means checkptr expects for it to
+ // be aligned properly. When we're copying a Windows-allocated struct to a
+ // Go-allocated one, make sure that the Go allocation is aligned to the
+ // pointer size.
const psize = int(unsafe.Sizeof(uintptr(0)))
-
- var dst []byte
- h = (*unsafeheader.Slice)(unsafe.Pointer(&dst))
alloc := make([]uintptr, (sdLen+psize-1)/psize)
- h.Data = (*unsafeheader.Slice)(unsafe.Pointer(&alloc)).Data
- h.Len = sdLen
- h.Cap = sdLen
-
+ dst := unsafe.Slice((*byte)(unsafe.Pointer(&alloc[0])), sdLen)
copy(dst, src)
return (*SECURITY_DESCRIPTOR)(unsafe.Pointer(&dst[0]))
}
diff --git a/vendor/golang.org/x/sys/windows/service.go b/vendor/golang.org/x/sys/windows/service.go
index c44a1b963..a9dc6308d 100644
--- a/vendor/golang.org/x/sys/windows/service.go
+++ b/vendor/golang.org/x/sys/windows/service.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build windows
-// +build windows
package windows
diff --git a/vendor/golang.org/x/sys/windows/str.go b/vendor/golang.org/x/sys/windows/str.go
index 4fc01434e..6a4f9ce6a 100644
--- a/vendor/golang.org/x/sys/windows/str.go
+++ b/vendor/golang.org/x/sys/windows/str.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build windows
-// +build windows
package windows
diff --git a/vendor/golang.org/x/sys/windows/syscall.go b/vendor/golang.org/x/sys/windows/syscall.go
index 8732cdb95..e85ed6b9c 100644
--- a/vendor/golang.org/x/sys/windows/syscall.go
+++ b/vendor/golang.org/x/sys/windows/syscall.go
@@ -3,7 +3,6 @@
// license that can be found in the LICENSE file.
//go:build windows
-// +build windows
// Package windows contains an interface to the low-level operating system
// primitives. OS details vary depending on the underlying system, and
diff --git a/vendor/golang.org/x/sys/windows/syscall_windows.go b/vendor/golang.org/x/sys/windows/syscall_windows.go
index 373d16388..6395a031d 100644
--- a/vendor/golang.org/x/sys/windows/syscall_windows.go
+++ b/vendor/golang.org/x/sys/windows/syscall_windows.go
@@ -15,8 +15,6 @@ import (
"time"
"unicode/utf16"
"unsafe"
-
- "golang.org/x/sys/internal/unsafeheader"
)
type Handle uintptr
@@ -127,8 +125,7 @@ func UTF16PtrToString(p *uint16) string {
for ptr := unsafe.Pointer(p); *(*uint16)(ptr) != 0; n++ {
ptr = unsafe.Pointer(uintptr(ptr) + unsafe.Sizeof(*p))
}
-
- return string(utf16.Decode(unsafe.Slice(p, n)))
+ return UTF16ToString(unsafe.Slice(p, n))
}
func Getpagesize() int { return 4096 }
@@ -157,6 +154,8 @@ func NewCallbackCDecl(fn interface{}) uintptr {
//sys GetModuleFileName(module Handle, filename *uint16, size uint32) (n uint32, err error) = kernel32.GetModuleFileNameW
//sys GetModuleHandleEx(flags uint32, moduleName *uint16, module *Handle) (err error) = kernel32.GetModuleHandleExW
//sys SetDefaultDllDirectories(directoryFlags uint32) (err error)
+//sys AddDllDirectory(path *uint16) (cookie uintptr, err error) = kernel32.AddDllDirectory
+//sys RemoveDllDirectory(cookie uintptr) (err error) = kernel32.RemoveDllDirectory
//sys SetDllDirectory(path string) (err error) = kernel32.SetDllDirectoryW
//sys GetVersion() (ver uint32, err error)
//sys FormatMessage(flags uint32, msgsrc uintptr, msgid uint32, langid uint32, buf []uint16, args *byte) (n uint32, err error) = FormatMessageW
@@ -194,6 +193,7 @@ func NewCallbackCDecl(fn interface{}) uintptr {
//sys GetComputerName(buf *uint16, n *uint32) (err error) = GetComputerNameW
//sys GetComputerNameEx(nametype uint32, buf *uint16, n *uint32) (err error) = GetComputerNameExW
//sys SetEndOfFile(handle Handle) (err error)
+//sys SetFileValidData(handle Handle, validDataLength int64) (err error)
//sys GetSystemTimeAsFileTime(time *Filetime)
//sys GetSystemTimePreciseAsFileTime(time *Filetime)
//sys GetTimeZoneInformation(tzi *Timezoneinformation) (rc uint32, err error) [failretval==0xffffffff]
@@ -216,7 +216,7 @@ func NewCallbackCDecl(fn interface{}) uintptr {
//sys shGetKnownFolderPath(id *KNOWNFOLDERID, flags uint32, token Token, path **uint16) (ret error) = shell32.SHGetKnownFolderPath
//sys TerminateProcess(handle Handle, exitcode uint32) (err error)
//sys GetExitCodeProcess(handle Handle, exitcode *uint32) (err error)
-//sys GetStartupInfo(startupInfo *StartupInfo) (err error) = GetStartupInfoW
+//sys getStartupInfo(startupInfo *StartupInfo) = GetStartupInfoW
//sys GetProcessTimes(handle Handle, creationTime *Filetime, exitTime *Filetime, kernelTime *Filetime, userTime *Filetime) (err error)
//sys DuplicateHandle(hSourceProcessHandle Handle, hSourceHandle Handle, hTargetProcessHandle Handle, lpTargetHandle *Handle, dwDesiredAccess uint32, bInheritHandle bool, dwOptions uint32) (err error)
//sys WaitForSingleObject(handle Handle, waitMilliseconds uint32) (event uint32, err error) [failretval==0xffffffff]
@@ -235,12 +235,13 @@ func NewCallbackCDecl(fn interface{}) uintptr {
//sys CreateEnvironmentBlock(block **uint16, token Token, inheritExisting bool) (err error) = userenv.CreateEnvironmentBlock
//sys DestroyEnvironmentBlock(block *uint16) (err error) = userenv.DestroyEnvironmentBlock
//sys getTickCount64() (ms uint64) = kernel32.GetTickCount64
+//sys GetFileTime(handle Handle, ctime *Filetime, atime *Filetime, wtime *Filetime) (err error)
//sys SetFileTime(handle Handle, ctime *Filetime, atime *Filetime, wtime *Filetime) (err error)
//sys GetFileAttributes(name *uint16) (attrs uint32, err error) [failretval==INVALID_FILE_ATTRIBUTES] = kernel32.GetFileAttributesW
//sys SetFileAttributes(name *uint16, attrs uint32) (err error) = kernel32.SetFileAttributesW
//sys GetFileAttributesEx(name *uint16, level uint32, info *byte) (err error) = kernel32.GetFileAttributesExW
//sys GetCommandLine() (cmd *uint16) = kernel32.GetCommandLineW
-//sys CommandLineToArgv(cmd *uint16, argc *int32) (argv *[8192]*[8192]uint16, err error) [failretval==nil] = shell32.CommandLineToArgvW
+//sys commandLineToArgv(cmd *uint16, argc *int32) (argv **uint16, err error) [failretval==nil] = shell32.CommandLineToArgvW
//sys LocalFree(hmem Handle) (handle Handle, err error) [failretval!=0]
//sys LocalAlloc(flags uint32, length uint32) (ptr uintptr, err error)
//sys SetHandleInformation(handle Handle, mask uint32, flags uint32) (err error)
@@ -299,12 +300,15 @@ func NewCallbackCDecl(fn interface{}) uintptr {
//sys RegNotifyChangeKeyValue(key Handle, watchSubtree bool, notifyFilter uint32, event Handle, asynchronous bool) (regerrno error) = advapi32.RegNotifyChangeKeyValue
//sys GetCurrentProcessId() (pid uint32) = kernel32.GetCurrentProcessId
//sys ProcessIdToSessionId(pid uint32, sessionid *uint32) (err error) = kernel32.ProcessIdToSessionId
+//sys ClosePseudoConsole(console Handle) = kernel32.ClosePseudoConsole
+//sys createPseudoConsole(size uint32, in Handle, out Handle, flags uint32, pconsole *Handle) (hr error) = kernel32.CreatePseudoConsole
//sys GetConsoleMode(console Handle, mode *uint32) (err error) = kernel32.GetConsoleMode
//sys SetConsoleMode(console Handle, mode uint32) (err error) = kernel32.SetConsoleMode
//sys GetConsoleScreenBufferInfo(console Handle, info *ConsoleScreenBufferInfo) (err error) = kernel32.GetConsoleScreenBufferInfo
//sys setConsoleCursorPosition(console Handle, position uint32) (err error) = kernel32.SetConsoleCursorPosition
//sys WriteConsole(console Handle, buf *uint16, towrite uint32, written *uint32, reserved *byte) (err error) = kernel32.WriteConsoleW
//sys ReadConsole(console Handle, buf *uint16, toread uint32, read *uint32, inputControl *byte) (err error) = kernel32.ReadConsoleW
+//sys resizePseudoConsole(pconsole Handle, size uint32) (hr error) = kernel32.ResizePseudoConsole
//sys CreateToolhelp32Snapshot(flags uint32, processId uint32) (handle Handle, err error) [failretval==InvalidHandle] = kernel32.CreateToolhelp32Snapshot
//sys Module32First(snapshot Handle, moduleEntry *ModuleEntry32) (err error) = kernel32.Module32FirstW
//sys Module32Next(snapshot Handle, moduleEntry *ModuleEntry32) (err error) = kernel32.Module32NextW
@@ -437,6 +441,10 @@ func NewCallbackCDecl(fn interface{}) uintptr {
//sys DwmGetWindowAttribute(hwnd HWND, attribute uint32, value unsafe.Pointer, size uint32) (ret error) = dwmapi.DwmGetWindowAttribute
//sys DwmSetWindowAttribute(hwnd HWND, attribute uint32, value unsafe.Pointer, size uint32) (ret error) = dwmapi.DwmSetWindowAttribute
+// Windows Multimedia API
+//sys TimeBeginPeriod (period uint32) (err error) [failretval != 0] = winmm.timeBeginPeriod
+//sys TimeEndPeriod (period uint32) (err error) [failretval != 0] = winmm.timeEndPeriod
+
// syscall interface implementation for other packages
// GetCurrentProcess returns the handle for the current process.
@@ -964,7 +972,8 @@ func (sa *SockaddrUnix) sockaddr() (unsafe.Pointer, int32, error) {
if n > 0 {
sl += int32(n) + 1
}
- if sa.raw.Path[0] == '@' {
+ if sa.raw.Path[0] == '@' || (sa.raw.Path[0] == 0 && sl > 3) {
+ // Check sl > 3 so we don't change unnamed socket behavior.
sa.raw.Path[0] = 0
// Don't count trailing NUL for abstract address.
sl--
@@ -1624,6 +1633,11 @@ func SetConsoleCursorPosition(console Handle, position Coord) error {
return setConsoleCursorPosition(console, *((*uint32)(unsafe.Pointer(&position))))
}
+func GetStartupInfo(startupInfo *StartupInfo) error {
+ getStartupInfo(startupInfo)
+ return nil
+}
+
func (s NTStatus) Errno() syscall.Errno {
return rtlNtStatusToDosErrorNoTeb(s)
}
@@ -1658,12 +1672,8 @@ func NewNTUnicodeString(s string) (*NTUnicodeString, error) {
// Slice returns a uint16 slice that aliases the data in the NTUnicodeString.
func (s *NTUnicodeString) Slice() []uint16 {
- var slice []uint16
- hdr := (*unsafeheader.Slice)(unsafe.Pointer(&slice))
- hdr.Data = unsafe.Pointer(s.Buffer)
- hdr.Len = int(s.Length)
- hdr.Cap = int(s.MaximumLength)
- return slice
+ slice := unsafe.Slice(s.Buffer, s.MaximumLength)
+ return slice[:s.Length]
}
func (s *NTUnicodeString) String() string {
@@ -1686,12 +1696,8 @@ func NewNTString(s string) (*NTString, error) {
// Slice returns a byte slice that aliases the data in the NTString.
func (s *NTString) Slice() []byte {
- var slice []byte
- hdr := (*unsafeheader.Slice)(unsafe.Pointer(&slice))
- hdr.Data = unsafe.Pointer(s.Buffer)
- hdr.Len = int(s.Length)
- hdr.Cap = int(s.MaximumLength)
- return slice
+ slice := unsafe.Slice(s.Buffer, s.MaximumLength)
+ return slice[:s.Length]
}
func (s *NTString) String() string {
@@ -1743,10 +1749,7 @@ func LoadResourceData(module, resInfo Handle) (data []byte, err error) {
if err != nil {
return
}
- h := (*unsafeheader.Slice)(unsafe.Pointer(&data))
- h.Data = unsafe.Pointer(ptr)
- h.Len = int(size)
- h.Cap = int(size)
+ data = unsafe.Slice((*byte)(unsafe.Pointer(ptr)), size)
return
}
@@ -1817,3 +1820,17 @@ type PSAPI_WORKING_SET_EX_INFORMATION struct {
// A PSAPI_WORKING_SET_EX_BLOCK union that indicates the attributes of the page at VirtualAddress.
VirtualAttributes PSAPI_WORKING_SET_EX_BLOCK
}
+
+// CreatePseudoConsole creates a windows pseudo console.
+func CreatePseudoConsole(size Coord, in Handle, out Handle, flags uint32, pconsole *Handle) error {
+ // We need this wrapper to manually cast Coord to uint32. The autogenerated wrappers only
+ // accept arguments that can be casted to uintptr, and Coord can't.
+ return createPseudoConsole(*((*uint32)(unsafe.Pointer(&size))), in, out, flags, pconsole)
+}
+
+// ResizePseudoConsole resizes the internal buffers of the pseudo console to the width and height specified in `size`.
+func ResizePseudoConsole(pconsole Handle, size Coord) error {
+ // We need this wrapper to manually cast Coord to uint32. The autogenerated wrappers only
+ // accept arguments that can be casted to uintptr, and Coord can't.
+ return resizePseudoConsole(pconsole, *((*uint32)(unsafe.Pointer(&size))))
+}
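
The CreatePseudoConsole/ResizePseudoConsole wrappers exist only to pack Coord into the uint32 the generated stubs expect. A hedged usage sketch, with arbitrary pipe wiring and console dimensions, might look like:

//go:build windows

package main

import (
	"fmt"

	"golang.org/x/sys/windows"
)

func main() {
	// Illustrative pipe wiring: inR/outW become the pseudo console's input and
	// output ends; a real host would hand inW/outR to the terminal side.
	var inR, inW, outR, outW windows.Handle
	if err := windows.CreatePipe(&inR, &inW, nil, 0); err != nil {
		panic(err)
	}
	if err := windows.CreatePipe(&outR, &outW, nil, 0); err != nil {
		panic(err)
	}
	defer windows.CloseHandle(inR)
	defer windows.CloseHandle(inW)
	defer windows.CloseHandle(outR)
	defer windows.CloseHandle(outW)

	var pc windows.Handle
	if err := windows.CreatePseudoConsole(windows.Coord{X: 80, Y: 25}, inR, outW, 0, &pc); err != nil {
		panic(err)
	}
	defer windows.ClosePseudoConsole(pc)

	// The same Coord packing applies when resizing the console buffers.
	if err := windows.ResizePseudoConsole(pc, windows.Coord{X: 120, Y: 40}); err != nil {
		fmt.Println("resize:", err)
	}
}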
diff --git a/vendor/golang.org/x/sys/windows/types_windows.go b/vendor/golang.org/x/sys/windows/types_windows.go
index 88e62a638..359780f6a 100644
--- a/vendor/golang.org/x/sys/windows/types_windows.go
+++ b/vendor/golang.org/x/sys/windows/types_windows.go
@@ -247,6 +247,7 @@ const (
PROC_THREAD_ATTRIBUTE_MITIGATION_POLICY = 0x00020007
PROC_THREAD_ATTRIBUTE_UMS_THREAD = 0x00030006
PROC_THREAD_ATTRIBUTE_PROTECTION_LEVEL = 0x0002000b
+ PROC_THREAD_ATTRIBUTE_PSEUDOCONSOLE = 0x00020016
)
const (
@@ -1093,7 +1094,33 @@ const (
SOMAXCONN = 0x7fffffff
- TCP_NODELAY = 1
+ TCP_NODELAY = 1
+ TCP_EXPEDITED_1122 = 2
+ TCP_KEEPALIVE = 3
+ TCP_MAXSEG = 4
+ TCP_MAXRT = 5
+ TCP_STDURG = 6
+ TCP_NOURG = 7
+ TCP_ATMARK = 8
+ TCP_NOSYNRETRIES = 9
+ TCP_TIMESTAMPS = 10
+ TCP_OFFLOAD_PREFERENCE = 11
+ TCP_CONGESTION_ALGORITHM = 12
+ TCP_DELAY_FIN_ACK = 13
+ TCP_MAXRTMS = 14
+ TCP_FASTOPEN = 15
+ TCP_KEEPCNT = 16
+ TCP_KEEPIDLE = TCP_KEEPALIVE
+ TCP_KEEPINTVL = 17
+ TCP_FAIL_CONNECT_ON_ICMP_ERROR = 18
+ TCP_ICMP_ERROR_INFO = 19
+
+ UDP_NOCHECKSUM = 1
+ UDP_SEND_MSG_SIZE = 2
+ UDP_RECV_MAX_COALESCED_SIZE = 3
+ UDP_CHECKSUM_COVERAGE = 20
+
+ UDP_COALESCED_INFO = 3
SHUT_RD = 0
SHUT_WR = 1
@@ -2139,6 +2166,12 @@ const (
ENABLE_LVB_GRID_WORLDWIDE = 0x10
)
+// Pseudo console related constants used for the flags parameter to
+// CreatePseudoConsole. See: https://learn.microsoft.com/en-us/windows/console/createpseudoconsole
+const (
+ PSEUDOCONSOLE_INHERIT_CURSOR = 0x1
+)
+
type Coord struct {
X int16
Y int16
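
The expanded TCP_*/UDP_* socket-option constants can be fed straight to the existing SetsockoptInt helper. A minimal sketch, assuming Winsock 2.2 and purely illustrative keepalive timings, is:

//go:build windows

package main

import (
	"fmt"

	"golang.org/x/sys/windows"
)

func main() {
	// Illustrative only: apply the newly exposed TCP keepalive knobs to a
	// freshly created socket.
	var wsaData windows.WSAData
	if err := windows.WSAStartup(uint32(0x202), &wsaData); err != nil { // Winsock 2.2
		panic(err)
	}
	defer windows.WSACleanup()

	fd, err := windows.Socket(windows.AF_INET, windows.SOCK_STREAM, windows.IPPROTO_TCP)
	if err != nil {
		panic(err)
	}
	defer windows.Closesocket(fd)

	// Example values: 30s idle before probes, probe every 5s, give up after 4 probes.
	for opt, val := range map[int]int{
		windows.TCP_KEEPIDLE:  30,
		windows.TCP_KEEPINTVL: 5,
		windows.TCP_KEEPCNT:   4,
	} {
		if err := windows.SetsockoptInt(fd, windows.IPPROTO_TCP, opt, val); err != nil {
			fmt.Println("setsockopt:", err)
		}
	}
}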
diff --git a/vendor/golang.org/x/sys/windows/zsyscall_windows.go b/vendor/golang.org/x/sys/windows/zsyscall_windows.go
index 566dd3e31..e8791c82c 100644
--- a/vendor/golang.org/x/sys/windows/zsyscall_windows.go
+++ b/vendor/golang.org/x/sys/windows/zsyscall_windows.go
@@ -55,6 +55,7 @@ var (
moduser32 = NewLazySystemDLL("user32.dll")
moduserenv = NewLazySystemDLL("userenv.dll")
modversion = NewLazySystemDLL("version.dll")
+ modwinmm = NewLazySystemDLL("winmm.dll")
modwintrust = NewLazySystemDLL("wintrust.dll")
modws2_32 = NewLazySystemDLL("ws2_32.dll")
modwtsapi32 = NewLazySystemDLL("wtsapi32.dll")
@@ -183,10 +184,12 @@ var (
procGetAdaptersInfo = modiphlpapi.NewProc("GetAdaptersInfo")
procGetBestInterfaceEx = modiphlpapi.NewProc("GetBestInterfaceEx")
procGetIfEntry = modiphlpapi.NewProc("GetIfEntry")
+ procAddDllDirectory = modkernel32.NewProc("AddDllDirectory")
procAssignProcessToJobObject = modkernel32.NewProc("AssignProcessToJobObject")
procCancelIo = modkernel32.NewProc("CancelIo")
procCancelIoEx = modkernel32.NewProc("CancelIoEx")
procCloseHandle = modkernel32.NewProc("CloseHandle")
+ procClosePseudoConsole = modkernel32.NewProc("ClosePseudoConsole")
procConnectNamedPipe = modkernel32.NewProc("ConnectNamedPipe")
procCreateDirectoryW = modkernel32.NewProc("CreateDirectoryW")
procCreateEventExW = modkernel32.NewProc("CreateEventExW")
@@ -201,6 +204,7 @@ var (
procCreateNamedPipeW = modkernel32.NewProc("CreateNamedPipeW")
procCreatePipe = modkernel32.NewProc("CreatePipe")
procCreateProcessW = modkernel32.NewProc("CreateProcessW")
+ procCreatePseudoConsole = modkernel32.NewProc("CreatePseudoConsole")
procCreateSymbolicLinkW = modkernel32.NewProc("CreateSymbolicLinkW")
procCreateToolhelp32Snapshot = modkernel32.NewProc("CreateToolhelp32Snapshot")
procDefineDosDeviceW = modkernel32.NewProc("DefineDosDeviceW")
@@ -250,6 +254,7 @@ var (
procGetFileAttributesW = modkernel32.NewProc("GetFileAttributesW")
procGetFileInformationByHandle = modkernel32.NewProc("GetFileInformationByHandle")
procGetFileInformationByHandleEx = modkernel32.NewProc("GetFileInformationByHandleEx")
+ procGetFileTime = modkernel32.NewProc("GetFileTime")
procGetFileType = modkernel32.NewProc("GetFileType")
procGetFinalPathNameByHandleW = modkernel32.NewProc("GetFinalPathNameByHandleW")
procGetFullPathNameW = modkernel32.NewProc("GetFullPathNameW")
@@ -326,7 +331,9 @@ var (
procReadProcessMemory = modkernel32.NewProc("ReadProcessMemory")
procReleaseMutex = modkernel32.NewProc("ReleaseMutex")
procRemoveDirectoryW = modkernel32.NewProc("RemoveDirectoryW")
+ procRemoveDllDirectory = modkernel32.NewProc("RemoveDllDirectory")
procResetEvent = modkernel32.NewProc("ResetEvent")
+ procResizePseudoConsole = modkernel32.NewProc("ResizePseudoConsole")
procResumeThread = modkernel32.NewProc("ResumeThread")
procSetCommTimeouts = modkernel32.NewProc("SetCommTimeouts")
procSetConsoleCursorPosition = modkernel32.NewProc("SetConsoleCursorPosition")
@@ -335,6 +342,7 @@ var (
procSetDefaultDllDirectories = modkernel32.NewProc("SetDefaultDllDirectories")
procSetDllDirectoryW = modkernel32.NewProc("SetDllDirectoryW")
procSetEndOfFile = modkernel32.NewProc("SetEndOfFile")
+ procSetFileValidData = modkernel32.NewProc("SetFileValidData")
procSetEnvironmentVariableW = modkernel32.NewProc("SetEnvironmentVariableW")
procSetErrorMode = modkernel32.NewProc("SetErrorMode")
procSetEvent = modkernel32.NewProc("SetEvent")
@@ -468,6 +476,8 @@ var (
procGetFileVersionInfoSizeW = modversion.NewProc("GetFileVersionInfoSizeW")
procGetFileVersionInfoW = modversion.NewProc("GetFileVersionInfoW")
procVerQueryValueW = modversion.NewProc("VerQueryValueW")
+ proctimeBeginPeriod = modwinmm.NewProc("timeBeginPeriod")
+ proctimeEndPeriod = modwinmm.NewProc("timeEndPeriod")
procWinVerifyTrustEx = modwintrust.NewProc("WinVerifyTrustEx")
procFreeAddrInfoW = modws2_32.NewProc("FreeAddrInfoW")
procGetAddrInfoW = modws2_32.NewProc("GetAddrInfoW")
@@ -1598,6 +1608,15 @@ func GetIfEntry(pIfRow *MibIfRow) (errcode error) {
return
}
+func AddDllDirectory(path *uint16) (cookie uintptr, err error) {
+ r0, _, e1 := syscall.Syscall(procAddDllDirectory.Addr(), 1, uintptr(unsafe.Pointer(path)), 0, 0)
+ cookie = uintptr(r0)
+ if cookie == 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
func AssignProcessToJobObject(job Handle, process Handle) (err error) {
r1, _, e1 := syscall.Syscall(procAssignProcessToJobObject.Addr(), 2, uintptr(job), uintptr(process), 0)
if r1 == 0 {
@@ -1630,6 +1649,11 @@ func CloseHandle(handle Handle) (err error) {
return
}
+func ClosePseudoConsole(console Handle) {
+ syscall.Syscall(procClosePseudoConsole.Addr(), 1, uintptr(console), 0, 0)
+ return
+}
+
func ConnectNamedPipe(pipe Handle, overlapped *Overlapped) (err error) {
r1, _, e1 := syscall.Syscall(procConnectNamedPipe.Addr(), 2, uintptr(pipe), uintptr(unsafe.Pointer(overlapped)), 0)
if r1 == 0 {
@@ -1759,6 +1783,14 @@ func CreateProcess(appName *uint16, commandLine *uint16, procSecurity *SecurityA
return
}
+func createPseudoConsole(size uint32, in Handle, out Handle, flags uint32, pconsole *Handle) (hr error) {
+ r0, _, _ := syscall.Syscall6(procCreatePseudoConsole.Addr(), 5, uintptr(size), uintptr(in), uintptr(out), uintptr(flags), uintptr(unsafe.Pointer(pconsole)), 0)
+ if r0 != 0 {
+ hr = syscall.Errno(r0)
+ }
+ return
+}
+
func CreateSymbolicLink(symlinkfilename *uint16, targetfilename *uint16, flags uint32) (err error) {
r1, _, e1 := syscall.Syscall(procCreateSymbolicLinkW.Addr(), 3, uintptr(unsafe.Pointer(symlinkfilename)), uintptr(unsafe.Pointer(targetfilename)), uintptr(flags))
if r1&0xff == 0 {
@@ -2166,6 +2198,14 @@ func GetFileInformationByHandleEx(handle Handle, class uint32, outBuffer *byte,
return
}
+func GetFileTime(handle Handle, ctime *Filetime, atime *Filetime, wtime *Filetime) (err error) {
+ r1, _, e1 := syscall.Syscall6(procGetFileTime.Addr(), 4, uintptr(handle), uintptr(unsafe.Pointer(ctime)), uintptr(unsafe.Pointer(atime)), uintptr(unsafe.Pointer(wtime)), 0, 0)
+ if r1 == 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
func GetFileType(filehandle Handle) (n uint32, err error) {
r0, _, e1 := syscall.Syscall(procGetFileType.Addr(), 1, uintptr(filehandle), 0, 0)
n = uint32(r0)
@@ -2367,11 +2407,8 @@ func GetShortPathName(longpath *uint16, shortpath *uint16, buflen uint32) (n uin
return
}
-func GetStartupInfo(startupInfo *StartupInfo) (err error) {
- r1, _, e1 := syscall.Syscall(procGetStartupInfoW.Addr(), 1, uintptr(unsafe.Pointer(startupInfo)), 0, 0)
- if r1 == 0 {
- err = errnoErr(e1)
- }
+func getStartupInfo(startupInfo *StartupInfo) {
+ syscall.Syscall(procGetStartupInfoW.Addr(), 1, uintptr(unsafe.Pointer(startupInfo)), 0, 0)
return
}
@@ -2854,6 +2891,14 @@ func RemoveDirectory(path *uint16) (err error) {
return
}
+func RemoveDllDirectory(cookie uintptr) (err error) {
+ r1, _, e1 := syscall.Syscall(procRemoveDllDirectory.Addr(), 1, uintptr(cookie), 0, 0)
+ if r1 == 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
func ResetEvent(event Handle) (err error) {
r1, _, e1 := syscall.Syscall(procResetEvent.Addr(), 1, uintptr(event), 0, 0)
if r1 == 0 {
@@ -2862,6 +2907,14 @@ func ResetEvent(event Handle) (err error) {
return
}
+func resizePseudoConsole(pconsole Handle, size uint32) (hr error) {
+ r0, _, _ := syscall.Syscall(procResizePseudoConsole.Addr(), 2, uintptr(pconsole), uintptr(size), 0)
+ if r0 != 0 {
+ hr = syscall.Errno(r0)
+ }
+ return
+}
+
func ResumeThread(thread Handle) (ret uint32, err error) {
r0, _, e1 := syscall.Syscall(procResumeThread.Addr(), 1, uintptr(thread), 0, 0)
ret = uint32(r0)
@@ -2936,6 +2989,14 @@ func SetEndOfFile(handle Handle) (err error) {
return
}
+func SetFileValidData(handle Handle, validDataLength int64) (err error) {
+ r1, _, e1 := syscall.Syscall(procSetFileValidData.Addr(), 2, uintptr(handle), uintptr(validDataLength), 0)
+ if r1 == 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
func SetEnvironmentVariable(name *uint16, value *uint16) (err error) {
r1, _, e1 := syscall.Syscall(procSetEnvironmentVariableW.Addr(), 2, uintptr(unsafe.Pointer(name)), uintptr(unsafe.Pointer(value)), 0)
if r1 == 0 {
@@ -3820,9 +3881,9 @@ func setupUninstallOEMInf(infFileName *uint16, flags SUOI, reserved uintptr) (er
return
}
-func CommandLineToArgv(cmd *uint16, argc *int32) (argv *[8192]*[8192]uint16, err error) {
+func commandLineToArgv(cmd *uint16, argc *int32) (argv **uint16, err error) {
r0, _, e1 := syscall.Syscall(procCommandLineToArgvW.Addr(), 2, uintptr(unsafe.Pointer(cmd)), uintptr(unsafe.Pointer(argc)), 0)
- argv = (*[8192]*[8192]uint16)(unsafe.Pointer(r0))
+ argv = (**uint16)(unsafe.Pointer(r0))
if argv == nil {
err = errnoErr(e1)
}
@@ -4017,6 +4078,22 @@ func _VerQueryValue(block unsafe.Pointer, subBlock *uint16, pointerToBufferPoint
return
}
+func TimeBeginPeriod(period uint32) (err error) {
+ r1, _, e1 := syscall.Syscall(proctimeBeginPeriod.Addr(), 1, uintptr(period), 0, 0)
+ if r1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+func TimeEndPeriod(period uint32) (err error) {
+ r1, _, e1 := syscall.Syscall(proctimeEndPeriod.Addr(), 1, uintptr(period), 0, 0)
+ if r1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
func WinVerifyTrustEx(hwnd HWND, actionId *GUID, data *WinTrustData) (ret error) {
r0, _, _ := syscall.Syscall(procWinVerifyTrustEx.Addr(), 3, uintptr(hwnd), uintptr(unsafe.Pointer(actionId)), uintptr(unsafe.Pointer(data)))
if r0 != 0 {
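
Finally, the new winmm bindings (TimeBeginPeriod/TimeEndPeriod) are meant to bracket a section that needs finer timer granularity. A short, illustrative sketch:

//go:build windows

package main

import (
	"time"

	"golang.org/x/sys/windows"
)

func main() {
	// Illustrative only: request 1ms timer resolution for the duration of a
	// latency-sensitive section, then restore it on exit.
	if err := windows.TimeBeginPeriod(1); err != nil {
		panic(err)
	}
	defer windows.TimeEndPeriod(1)

	time.Sleep(5 * time.Millisecond) // wakes up much closer to 5ms than with the default period
}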
diff --git a/vendor/modules.txt b/vendor/modules.txt
index d25103fce..a1959a9a0 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -461,9 +461,12 @@ github.com/klauspost/compress/internal/cpuinfo
github.com/klauspost/compress/internal/snapref
github.com/klauspost/compress/zstd
github.com/klauspost/compress/zstd/internal/xxhash
-# github.com/klauspost/cpuid/v2 v2.2.5
+# github.com/klauspost/cpuid/v2 v2.2.7
## explicit; go 1.15
github.com/klauspost/cpuid/v2
+# github.com/klauspost/reedsolomon v1.12.1
+## explicit; go 1.18
+github.com/klauspost/reedsolomon
# github.com/koron/go-ssdp v0.0.4
## explicit; go 1.19
github.com/koron/go-ssdp
@@ -1238,11 +1241,10 @@ golang.org/x/net/route
## explicit; go 1.17
golang.org/x/sync/errgroup
golang.org/x/sync/singleflight
-# golang.org/x/sys v0.11.0
-## explicit; go 1.17
+# golang.org/x/sys v0.18.0
+## explicit; go 1.18
golang.org/x/sys/cpu
golang.org/x/sys/execabs
-golang.org/x/sys/internal/unsafeheader
golang.org/x/sys/plan9
golang.org/x/sys/unix
golang.org/x/sys/windows